diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..bd764a800a9d80da448fe912b9e8263364fdc229
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,131 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+flagged/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..b076d86084a9743afbd07dac765b7fdabb8e064f
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, Aastha Singh
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 91e7a1244e05851f5bc1073302d94ebb97f2321d..7fd235c0f0d7f48e345b3fb1c9eae7903a67cdd3 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,94 @@
----
-title: GLIP BLIP Object Detection VQA
-emoji: 📊
-colorFrom: indigo
-colorTo: pink
-sdk: gradio
-sdk_version: 3.4.1
-app_file: app.py
-pinned: false
-license: bsd-3-clause
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Vision-Language Object Detection and Visual Question Answering
+This repository combines Microsoft's GLIP and Salesforce's BLIP into an ensembled demo for text-prompted object detection and Visual Question Answering.
+
+
+
+## About GLIP: Grounded Language-Image Pre-training
+> GLIP demonstrates strong zero-shot and few-shot transferability to various object-level recognition tasks.
+
+> The model used in this repo is GLIP-T; it was originally pre-trained on Conceptual Captions 3M and SBU captions.
+
+
+
+## About BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
+
+> BLIP introduces a new model architecture that enables a wider range of downstream tasks than existing methods, and a new dataset-bootstrapping method for learning from noisy web data.
+
+
+
+## Installation and Setup
+
+***Environment*** - Due to limitations with `maskrcnn_benchmark`, this repo requires PyTorch 1.10 and the matching torchvision.
+
+Use `requirements.txt` to install dependencies:
+
+```sh
+pip3 install -r requirements.txt
+```
+Build `maskrcnn_benchmark`
+```sh
+python setup.py build develop --user
+```
+
+To verify a successful build, check the terminal for the message
+"Finished processing dependencies for maskrcnn-benchmark==0.1".
+
+## Checkpoints
+
+> Download the pre-trained models into the `checkpoints` folder.
+
+
+
+```sh
+mkdir checkpoints
+cd checkpoints
+```
+
+Model | Weight
+-- | --
+**GLIP-T** | [weight](https://drive.google.com/file/d/1nlPL6PHkslarP6RiWJJu6QGKjqHG4tkc/view?usp=sharing)
+**BLIP** | [weight](https://drive.google.com/file/d/1QliNGiAcyCCJLd22eNOxWvMUDzb7GzrO/view?usp=sharing)
+
+
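+One way to fetch them from the command line is with `gdown` (an assumption — any Google Drive download method works; the file IDs below come from the links above, and the output filenames match what `app.py` expects):
+
+```sh
+pip3 install gdown
+# GLIP-T detection checkpoint
+gdown "https://drive.google.com/uc?id=1nlPL6PHkslarP6RiWJJu6QGKjqHG4tkc" -O glip_tiny_model_o365_goldg_cc_sbu.pth
+# BLIP VQA checkpoint
+gdown "https://drive.google.com/uc?id=1QliNGiAcyCCJLd22eNOxWvMUDzb7GzrO" -O model_base_vqa_capfilt_large.pth
+```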
+
+## Run the local demo with the Gradio interface (requires an NVIDIA GPU with 8 GB VRAM)
+
+```sh
+python3 app.py
+```
+## Future Work
+
+- [x] Frame-based Visual Question Answering
+- [ ] Per-object Visual Question Answering
+
+
+## Citations
+
+```txt
+@inproceedings{li2022blip,
+ title={BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
+ author={Junnan Li and Dongxu Li and Caiming Xiong and Steven Hoi},
+ year={2022},
+ booktitle={ICML},
+}
+@inproceedings{li2021grounded,
+ title={Grounded Language-Image Pre-training},
+ author={Liunian Harold Li* and Pengchuan Zhang* and Haotian Zhang* and Jianwei Yang and Chunyuan Li and Yiwu Zhong and Lijuan Wang and Lu Yuan and Lei Zhang and Jenq-Neng Hwang and Kai-Wei Chang and Jianfeng Gao},
+ year={2022},
+ booktitle={CVPR},
+}
+@article{zhang2022glipv2,
+ title={GLIPv2: Unifying Localization and Vision-Language Understanding},
+ author={Zhang, Haotian* and Zhang, Pengchuan* and Hu, Xiaowei and Chen, Yen-Chun and Li, Liunian Harold and Dai, Xiyang and Wang, Lijuan and Yuan, Lu and Hwang, Jenq-Neng and Gao, Jianfeng},
+ journal={arXiv preprint arXiv:2206.05836},
+ year={2022}
+}
+@article{li2022elevater,
+ title={ELEVATER: A Benchmark and Toolkit for Evaluating Language-Augmented Visual Models},
+ author={Li*, Chunyuan and Liu*, Haotian and Li, Liunian Harold and Zhang, Pengchuan and Aneja, Jyoti and Yang, Jianwei and Jin, Ping and Lee, Yong Jae and Hu, Houdong and Liu, Zicheng and others},
+ journal={arXiv preprint arXiv:2204.08790},
+ year={2022}
+}
+```
+## Acknowledgement
+The implementation of this work relies on resources from BLIP, GLIP, Hugging Face Transformers, and timm. We thank the original authors for open-sourcing their work.
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f6a03c5bcd0e21ebcacac2453c128052f9deac0
--- /dev/null
+++ b/app.py
@@ -0,0 +1,57 @@
+import os
+import gradio as gr
+import warnings
+
+warnings.filterwarnings("ignore")
+
+os.system("python setup.py build develop --user")
+
+from maskrcnn_benchmark.config import cfg
+from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
+import vqa
+
+# Config and weights for evaluating the GLIP-T model
+config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
+weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"
+
+# manually override some options
+cfg.local_rank = 0
+cfg.num_gpus = 1
+cfg.merge_from_file(config_file)
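+# merge_from_list takes a flat [KEY, value, KEY, value, ...] list (yacs convention)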
+cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
+cfg.merge_from_list(["MODEL.DEVICE", "cuda"])
+
+glip_demo = GLIPDemo(
+ cfg,
+ min_image_size=800,
+ confidence_threshold=0.7,
+ show_mask_heatmaps=False
+)
+blip_demo = vqa.VQA(
+    model_path='checkpoints/model_base_vqa_capfilt_large.pth'
+)
+
+def predict(image, objects, question):
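+    # Gradio supplies RGB arrays; GLIPDemo expects BGR, so the channels are flipped here (and flipped back for display)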
+    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], objects, 0.5)
+ answer = blip_demo.vqa_demo(image, question)
+ return result[:, :, [2, 1, 0]], answer
+
+
+gr.Interface(
+ description="GLIP + BLIP VQA Demo.",
+ fn=predict,
+ inputs=[
+ "image",
+ gr.Textbox(label='Objects', lines=1, placeholder="Objects here.."),
+ gr.Textbox(label='Question', lines=1, placeholder="Question here..")],
+
+ outputs=[
+ gr.outputs.Image(
+ type="pil",
+ label="grounding results"
+ ),
+ gr.Textbox(label="Answer")
+ ],
+).launch()
\ No newline at end of file
diff --git a/checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth b/checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d05b8d5d3318107871c13ca068ee094644600779
--- /dev/null
+++ b/checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bec0a3dea804fcb278d7106c5438de5116ee888e49dfae46270e7ad7bc4ccbf
+size 3710104213
diff --git a/checkpoints/model_base_vqa_capfilt_large.pth b/checkpoints/model_base_vqa_capfilt_large.pth
new file mode 100644
index 0000000000000000000000000000000000000000..df8c62ad684ab84409a19a947cd33b920b78b5ad
--- /dev/null
+++ b/checkpoints/model_base_vqa_capfilt_large.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7a7d546209f1ccfa8b3cd3a0138c53e0d1e95e4a4bc280bef8f67e20fe4925ae
+size 1446244375
diff --git a/configs/glip_Swin_T_O365_GoldG.yaml b/configs/glip_Swin_T_O365_GoldG.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..80b9edba1b47a83f5da99254dd081dac3f80354a
--- /dev/null
+++ b/configs/glip_Swin_T_O365_GoldG.yaml
@@ -0,0 +1,100 @@
+MODEL:
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
+ WEIGHT: "swin_tiny_patch4_window7_224.pth"
+ RPN_ONLY: True
+ RPN_ARCHITECTURE: "VLDYHEAD"
+
+ BACKBONE:
+ CONV_BODY: "SWINT-FPN-RETINANET"
+ OUT_CHANNELS: 256
+ FREEZE_CONV_BODY_AT: -1
+
+ LANGUAGE_BACKBONE:
+ FREEZE: False
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+ MASK_SPECIAL: False
+
+ RPN:
+ USE_FPN: True
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+ ASPECT_RATIOS: (1.0,)
+ SCALES_PER_OCTAVE: 1
+
+ DYHEAD:
+ CHANNELS: 256
+ NUM_CONVS: 6
+ USE_GN: True
+ USE_DYRELU: True
+ USE_DFCONV: True
+ USE_DYFUSE: True
+ TOPK: 9 # topk for selecting candidate positive samples from each level
+ SCORE_AGG: "MEAN"
+ LOG_SCALE: 0.0
+
+ FUSE_CONFIG:
+ EARLY_FUSE_ON: True
+ TYPE: "MHA-B"
+ USE_CLASSIFICATION_LOSS: False
+ USE_TOKEN_LOSS: False
+ USE_CONTRASTIVE_ALIGN_LOSS: False
+ CONTRASTIVE_HIDDEN_DIM: 64
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
+ USE_LAYER_SCALE: True
+ CLAMP_MIN_FOR_UNDERFLOW: True
+ CLAMP_MAX_FOR_OVERFLOW: True
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+ CLAMP_DOT_PRODUCT: True
+
+ USE_CHECKPOINT: True
+
+TEST:
+ DURING_TRAINING: False
+ IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+ TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
+ TEST: ("coco_2017_val", )
+ DISABLE_SHUFFLE: False
+ ADD_DET_PROMPT: False
+ RANDOM_SAMPLE_NEG: 85
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+ SEPARATION_TOKENS: ". "
+
+INPUT:
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+ MIN_SIZE_TRAIN: 800
+ MAX_SIZE_TRAIN: 1333
+ MIN_SIZE_TEST: 800
+ MAX_SIZE_TEST: 1333
+
+AUGMENT:
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+ SIZE_DIVISIBILITY: 32
+
+SOLVER:
+ OPTIMIZER: ADAMW
+ BASE_LR: 0.0001
+ LANG_LR: 0.00001
+ WEIGHT_DECAY: 0.0001
+ STEPS: (0.67, 0.89)
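+  # note: values below 1 appear to be fractions of the total schedule rather than raw iteration counts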
+ MAX_EPOCH: 30
+ IMS_PER_BATCH: 64
+ WARMUP_ITERS: 2000
+ WARMUP_FACTOR: 0.001
+ USE_AMP: True
+ MODEL_EMA: 0.999
+ FIND_UNUSED_PARAMETERS: False
+
+ CLIP_GRADIENTS:
+ ENABLED: True
+ CLIP_TYPE: "full_model"
+ CLIP_VALUE: 1.0
+ NORM_TYPE: 2.0
\ No newline at end of file
diff --git a/configs/med_config.json b/configs/med_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..0ffad0a6f3c2f9f11b8faa84529d9860bb70327a
--- /dev/null
+++ b/configs/med_config.json
@@ -0,0 +1,21 @@
+{
+ "architectures": [
+ "BertModel"
+ ],
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "max_position_embeddings": 512,
+ "model_type": "bert",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "pad_token_id": 0,
+ "type_vocab_size": 2,
+ "vocab_size": 30524,
+ "encoder_width": 768,
+ "add_cross_attention": true
+}
diff --git a/configs/vqa.yaml b/configs/vqa.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..74327e6d0a34672023b44569558fe8beeb052548
--- /dev/null
+++ b/configs/vqa.yaml
@@ -0,0 +1,25 @@
+vqa_root: '/export/share/datasets/vision/VQA/Images/mscoco/' # followed by train2014/
+vg_root: '/export/share/datasets/vision/visual-genome/' # followed by image/
+train_files: ['vqa_train','vqa_val','vg_qa']
+ann_root: 'annotation'
+
+# set pretrained as a file path or a URL
+pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth'
+
+# size of vit model; base or large
+vit: 'base'
+batch_size_train: 16
+batch_size_test: 32
+vit_grad_ckpt: False
+vit_ckpt_layer: 0
+init_lr: 2e-5
+
+image_size: 480
+
+k_test: 128
+inference: 'rank'
+
+# optimizer
+weight_decay: 0.05
+min_lr: 0
+max_epoch: 10
\ No newline at end of file
diff --git a/itm.py b/itm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6da8af6dfe782beff41de4efb952f481fa97a6c6
--- /dev/null
+++ b/itm.py
@@ -0,0 +1,77 @@
+import sys
+from PIL import Image
+import torch
+from torchvision import transforms
+from torchvision.transforms.functional import InterpolationMode
+from models.blip_vqa import blip_vqa
+from models.blip_itm import blip_itm
+
+
+class VQA:
+ def __init__(self, model_path, image_size=480):
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ self.model = blip_vqa(pretrained=model_path, image_size=image_size, vit='base')
+ self.model.eval()
+ self.model = self.model.to(self.device)
+
+ def load_demo_image(self, image_size, img_path, device):
+ raw_image = Image.open(img_path).convert('RGB')
+ w,h = raw_image.size
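+        # resize to a square input and normalize with CLIP's image statistics (the mean/std below), as in the BLIP demo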
+ transform = transforms.Compose([
+ transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
+ transforms.ToTensor(),
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+ ])
+ image = transform(raw_image).unsqueeze(0).to(device)
+ return raw_image, image
+
+ def vqa(self, img_path, question):
+ raw_image, image = self.load_demo_image(image_size=480, img_path=img_path, device=self.device)
+ with torch.no_grad():
+ answer = self.model(image, question, train=False, inference='generate')
+ return answer[0]
+
+
+class ITM:
+ def __init__(self, model_path, image_size=384):
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ self.model = blip_itm(pretrained=model_path, image_size=image_size, vit='base')
+ self.model.eval()
+        self.model = self.model.to(self.device)
+
+ def load_demo_image(self, image_size, img_path, device):
+ raw_image = Image.open(img_path).convert('RGB')
+ w,h = raw_image.size
+ transform = transforms.Compose([
+ transforms.Resize((image_size,image_size),interpolation=InterpolationMode.BICUBIC),
+ transforms.ToTensor(),
+ transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+ ])
+ image = transform(raw_image).unsqueeze(0).to(device)
+ return raw_image, image
+
+ def itm(self, img_path, caption):
+ raw_image, image = self.load_demo_image(image_size=384,img_path=img_path, device=self.device)
+ itm_output = self.model(image,caption,match_head='itm')
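+        # the ITM head returns two-way logits; after softmax, index 1 is the probability that the image and text match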
+ itm_score = torch.nn.functional.softmax(itm_output,dim=1)[:,1]
+ itc_score = self.model(image,caption,match_head='itc')
+ # print('The image and text is matched with a probability of %.4f'%itm_score)
+ # print('The image feature and text feature has a cosine similarity of %.4f'%itc_score)
+ return itm_score, itc_score
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print('Format: python3 itm.py <image_path> <caption>')
+        print('Sample: python3 itm.py sample.jpg "What is the color of the horse?"')
+
+ else:
+ model_path = 'checkpoints/model_base_vqa_capfilt_large.pth'
+ model2_path = 'model_base_retrieval_coco.pth'
+ # vqa_object = VQA(model_path=model_path)
+ itm_object = ITM(model_path=model2_path)
+ img_path = sys.argv[1]
+ # question = sys.argv[2]
+ caption = sys.argv[2]
+ # answer = vqa_object.vqa(img_path, caption)
+ itm_score, itc_score = itm_object.itm(img_path, caption)
+ # print('Question: {} | Answer: {}'.format(caption, answer))
+    print('Caption: {} | match probability: {:.4f} | image-text cosine similarity: {:.4f}'.format(caption, itm_score.item(), itc_score.item()))
+
diff --git a/maskrcnn_benchmark/__init__.py b/maskrcnn_benchmark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bc96c7a6bf8379e1adfb3e4adf536107b385fa9
--- /dev/null
+++ b/maskrcnn_benchmark/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2015d6bd830bc3e0ec8b1ca7fcb63b4781a41ad
--- /dev/null
+++ b/maskrcnn_benchmark/config/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from .defaults import _C as cfg
+from .paths_catalog import try_to_find
\ No newline at end of file
diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd62a9ea307b727e0db06985264707046e8c7234
--- /dev/null
+++ b/maskrcnn_benchmark/config/defaults.py
@@ -0,0 +1,861 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import os
+
+from yacs.config import CfgNode as CN
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+_C.MODEL = CN()
+_C.MODEL.RPN_ONLY = False
+_C.MODEL.BOX_ON = True
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+_C.MODEL.RPN_ARCHITECTURE = "RPN"
+_C.MODEL.DEBUG = False # add debug flag
+_C.MODEL.ONNX = False # add onnx flag
+
+# If the WEIGHT starts with a catalog://, like :R-50, the code will look for
+# the path in paths_catalog. Else, it will use it as the specified absolute
+# path
+_C.MODEL.WEIGHT = ""
+_C.MODEL.PRETRAIN_NAME = ""
+
+# If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
+_C.MODEL.LINEAR_PROB = False
+
+# -----------------------------------------------------------------------------
+# Multitask Training / Test specific parameters
+# -----------------------------------------------------------------------------
+_C.MODEL.MULTITASK = CN(new_allowed=True)
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+# Values to be used for image normalization
+_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
+# Values to be used for image normalization
+_C.INPUT.PIXEL_STD = [1., 1., 1.]
+# Convert image to BGR format (for Caffe2 models), in range 0-255
+_C.INPUT.TO_BGR255 = True
+_C.INPUT.FORMAT = ''
+_C.INPUT.FIX_RES = False
+
+# -----------------------------------------------------------------------------
+# Augmentation
+# -----------------------------------------------------------------------------
+_C.AUGMENT = CN()
+_C.AUGMENT.USE_RA = 0
+_C.AUGMENT.FLIP_PROB_TRAIN = 0.5
+_C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
+_C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
+
+_C.AUGMENT.BRIGHTNESS = 0.0
+_C.AUGMENT.CONTRAST = 0.0
+_C.AUGMENT.SATURATION = 0.0
+_C.AUGMENT.HUE = 0.0
+
+_C.AUGMENT.CROP_PROB = 0.5
+_C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
+_C.AUGMENT.CROP_MIN_SIZE = 0.3
+
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training, as present in paths_catalog.py
+_C.DATASETS.TRAIN = ()
+# List of the dataset names for testing, as present in paths_catalog.py
+_C.DATASETS.TEST = ()
+# Use is_crowd label
+_C.DATASETS.USE_CROWD = False
+_C.DATASETS.CLASS_AGNOSTIC = False
+_C.DATASETS.CLASS_CONCAT = False
+_C.DATASETS.MAX_BOX = -1
+_C.DATASETS.SAMPLE_RATIO = 0.0
+_C.DATASETS.FEW_SHOT = 0
+# SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
+_C.DATASETS.SHUFFLE_SEED = 0
+_C.DATASETS.PREDEFINED_TEXT = ''
+_C.DATASETS.ALTERNATIVE_TRAINING = False
+_C.DATASETS.MULTISTAGE_TRAINING = False
+_C.DATASETS.REGISTER = CN(new_allowed=True)
+_C.DATASETS.BOX_THRESHOLD = 0.1
+# Duplicate Dataset
+_C.DATASETS.COCO_COPY = 1
+_C.DATASETS.LVIS_COPY = 1
+_C.DATASETS.FLICKR_COPY = 1
+_C.DATASETS.MIXED_COPY = 1
+_C.DATASETS.OBJECT365_COPY = 1
+_C.DATASETS.VG_COPY = 1
+_C.DATASETS.OI_COPY = 1
+_C.DATASETS.IN_COPY = 1
+
+_C.DATASETS.GENERAL_COPY = -1
+_C.DATASETS.GENERAL_COPY_TEST = -1
+
+# OD to Grounding
+_C.DATASETS.RANDOM_SAMPLE_NEG = -1
+_C.DATASETS.ADD_DET_PROMPT = False
+_C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
+_C.DATASETS.USE_OD_AUG = False
+_C.DATASETS.USE_COCO_FORMAT = False
+_C.DATASETS.CONTROL_PROB = ()
+_C.DATASETS.DISABLE_SHUFFLE = False
+_C.DATASETS.PROMPT_VERSION = ""
+_C.DATASETS.PROMPT_LIMIT_NEG = -1
+_C.DATASETS.POS_QUESTION_PROB = 0.6
+_C.DATASETS.NEG_QUESTION_PROB = 0.8
+_C.DATASETS.FULL_QUESTION_PROB = 0.5
+_C.DATASETS.ONE_HOT = False
+_C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
+
+_C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
+_C.DATASETS.SEPARATION_TOKENS = " "
+
+# LVIS
+_C.DATASETS.LVIS_USE_NORMAL_AP = False
+_C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
+
+# Caption
+_C.DATASETS.BING_INDEX_LIST = []
+_C.DATASETS.CAPTION_MIN_BOX = 1
+_C.DATASETS.REPLACE_CLEAN_LABEL = False
+_C.DATASETS.FURTHER_SCREEN = False
+_C.DATASETS.CAPTION_CONF = 0.9
+_C.DATASETS.CAPTION_NMS = 0.9
+_C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
+_C.DATASETS.INFERENCE_CAPTION = False
+_C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
+_C.DATASETS.RANDOM_PACK_PROB = -1.0
+_C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
+_C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
+_C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
+_C.DATASETS.LOCAL_DEBUG = False
+
+
+# Od in the wild
+_C.DATASETS.PREDEFINED_TEXT = None
+_C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
+_C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
+_C.DATASETS.OVERRIDE_CATEGORY = None
+_C.DATASETS.USE_OVERRIDE_CATEGORY = False
+_C.DATASETS.SUPRESS_QUERY = None
+_C.DATASETS.USE_SUPRESS_QUERY = False
+_C.DATASETS.USE_CAPTION_PROMPT = False
+_C.DATASETS.CAPTION_PROMPT = None
+
+_C.DATASETS.FLICKR_GT_TYPE = "separate"
+
+# VQA
+_C.DATASETS.DIVER_BOX_FOR_VQA = False
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If > 0, this enforces that each collated batch should have a size divisible
+# by SIZE_DIVISIBILITY
+_C.DATALOADER.SIZE_DIVISIBILITY = 0
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Define min number of keypoints required from GT, for example 10 out of 17
+_C.DATALOADER.MIN_KPS_PER_IMS = 0
+# Use random sampler during training
+_C.DATALOADER.USE_RANDOM_SEED = False
+
+_C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+# The backbone conv body to use
+# The string must match a function that is imported in modeling.model_builder
+# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
+# backbone)
+_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
+
+# Add StopGrad at a specified stage so the bottom layers are frozen
+_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
+_C.MODEL.BACKBONE.FREEZE = False
+_C.MODEL.BACKBONE.GROUP = 1
+_C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
+# Option to reset bn running statistics
+_C.MODEL.BACKBONE.RESET_BN = False
+# Backbone Normalization Level
+_C.MODEL.BACKBONE.NORM_LEVEL = 3
+# BN for backbone
+_C.MODEL.BACKBONE.USE_BN = False
+# Sync BN for backbone
+_C.MODEL.BACKBONE.USE_SYNCBN = False
+_C.MODEL.BACKBONE.USE_NSYNCBN = False
+# GN for backbone
+_C.MODEL.BACKBONE.USE_GN = False
+# Evo Norm for backbone
+_C.MODEL.BACKBONE.USE_EN = False
+# Layers for backbone
+_C.MODEL.BACKBONE.USE_DFCONV = False
+_C.MODEL.BACKBONE.USE_DYRELU = False
+_C.MODEL.BACKBONE.USE_SE = False
+_C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
+_C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
+_C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
+_C.MODEL.BACKBONE.FPN_LAYER = ()
+_C.MODEL.BACKBONE.USE_CHECKPOINT = False
+# Add JF efficient det cfgs
+_C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
+_C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
+_C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
+
+_C.MODEL.LANGUAGE_BACKBONE = CN()
+_C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
+_C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
+_C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
+_C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
+_C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
+_C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
+_C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
+_C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
+_C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
+_C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
+
+_C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
+_C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
+_C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
+_C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
+_C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
+_C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
+_C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
+
+_C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+_C.MODEL.FPN.FREEZE = False
+_C.MODEL.FPN.USE_GN = False
+_C.MODEL.FPN.USE_RELU = False
+_C.MODEL.FPN.USE_DYRELU = False
+_C.MODEL.FPN.DROP_BLOCK = True
+_C.MODEL.FPN.DROP_PROB = 0.3
+_C.MODEL.FPN.DROP_SIZE = 3
+_C.MODEL.FPN.USE_SPP = False
+_C.MODEL.FPN.USE_PAN = False
+_C.MODEL.FPN.USE_DYHEAD = False
+_C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
+# ---------------------------------------------------------------------------- #
+# BIFPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BIFPN = CN()
+_C.MODEL.BIFPN.NUM_REPEATS = 1
+_C.MODEL.BIFPN.USE_ATTENTION = True
+
+# ---------------------------------------------------------------------------- #
+# Group Norm options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.GROUP_NORM = CN()
+# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
+_C.MODEL.GROUP_NORM.DIM_PER_GP = -1
+# Number of groups in GroupNorm (-1 if using DIM_PER_GP)
+_C.MODEL.GROUP_NORM.NUM_GROUPS = 16
+# GroupNorm's small constant in the denominator
+_C.MODEL.GROUP_NORM.EPSILON = 1e-5
+
+# ---------------------------------------------------------------------------- #
+# Evo Norm options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.EVO_NORM = CN()
+# Number of groups in EvoNorm (-1 if using DIM_PER_GP)
+_C.MODEL.EVO_NORM.NUM_GROUPS = 8
+# EvoNorm's small constant in the denominator
+_C.MODEL.EVO_NORM.EPSILON = 1e-5
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Options (Follow the Detectron version)
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+# This is the number of foreground classes and background.
+_C.MODEL.RETINANET.NUM_CLASSES = 81
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+# During inference, #locs to select based on cls score before NMS is performed
+# per FPN level
+_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
+# Prior prob for the positives at the beginning of training. This is used to set
+# the bias init for the logits layer
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+# Inference cls score threshold, anchors with score > INFERENCE_TH are
+# considered for inference
+_C.MODEL.RETINANET.INFERENCE_TH = 0.05
+# NMS threshold used in RetinaNet
+_C.MODEL.RETINANET.NMS_TH = 0.4
+_C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
+
+# ---------------------------------------------------------------------------- #
+# Focal Loss Options (Follow the Detectron version)
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FOCAL = CN()
+# Weight for bbox_regression loss
+_C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
+# Smooth L1 loss beta for bbox regression
+_C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
+# IoU overlap ratio for labeling an anchor as positive
+# Anchors with >= iou overlap are labeled positive
+_C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
+# IoU overlap ratio for labeling an anchor as negative
+# Anchors with < iou overlap are labeled negative
+_C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
+# Focal loss parameter: alpha
+_C.MODEL.FOCAL.LOSS_ALPHA = 0.25
+# Focal loss parameter: gamma
+_C.MODEL.FOCAL.LOSS_GAMMA = 2.0
+
+# ---------------------------------------------------------------------------- #
+# FCOS Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FCOS = CN()
+_C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
+_C.MODEL.FCOS.PRIOR_PROB = 0.01
+_C.MODEL.FCOS.INFERENCE_TH = 0.05
+_C.MODEL.FCOS.NMS_TH = 0.6
+_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.FCOS.NUM_CONVS = 4
+# if use deformable conv to align features
+_C.MODEL.FCOS.USE_DFCONV = False
+
+# if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
+_C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
+# IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
+_C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
+
+_C.MODEL.FCOS.NORM_REG_TARGETS = False
+_C.MODEL.FCOS.CENTERNESS_ON_REG = False
+_C.MODEL.FCOS.USE_GT_CENTER = False
+
+_C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
+_C.MODEL.FCOS.USE_GN = False
+_C.MODEL.FCOS.USE_BN = False
+
+_C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
+_C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
+_C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
+
+# ---------------------------------------------------------------------------- #
+# ATSS Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ATSS = CN()
+_C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.ATSS.PRIOR_PROB = 0.01
+_C.MODEL.ATSS.INFERENCE_TH = 0.05
+_C.MODEL.ATSS.NMS_TH = 0.6
+_C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.ATSS.NUM_CONVS = 4
+# the channels of convolutions used in the cls and bbox tower
+_C.MODEL.ATSS.CHANNELS = 128
+# if use deformable conv to align features
+_C.MODEL.ATSS.USE_DFCONV = False
+
+# topk for selecting candidate positive samples from each level
+_C.MODEL.ATSS.TOPK = 9
+
+# Weight for bbox_regression loss
+_C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
+
+_C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
+_C.MODEL.ATSS.USE_GN = False
+_C.MODEL.ATSS.USE_BN = False
+
+_C.MODEL.ATSS.USE_DYRELU = False
+_C.MODEL.ATSS.USE_SE = False
+
+_C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
+_C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
+_C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
+# ---------------------------------------------------------------------------- #
+# DYHEAD Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.DYHEAD = CN()
+_C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.DYHEAD.PRIOR_PROB = 0.01
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.DYHEAD.NUM_CONVS = 4
+# the channels of convolutions used in the cls and bbox tower
+_C.MODEL.DYHEAD.CHANNELS = 128
+_C.MODEL.DYHEAD.GROUPS = 1
+# if use deformable conv to align features
+_C.MODEL.DYHEAD.USE_DFCONV = False
+
+# topk for selecting candidate positive samples from each level
+_C.MODEL.DYHEAD.TOPK = 9
+
+_C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
+
+_C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
+_C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # temperature (shallow contrastive)
+
+_C.MODEL.DYHEAD.USE_GN = False
+_C.MODEL.DYHEAD.USE_NSYNCBN = False
+_C.MODEL.DYHEAD.USE_SYNCBN = False
+
+_C.MODEL.DYHEAD.USE_DYFUSE = False
+_C.MODEL.DYHEAD.USE_DYRELU = False
+
+_C.MODEL.DYHEAD.CONV_FUNC = ''
+
+# CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
+_C.MODEL.DYHEAD.COSINE_SCALE = -1.0
+
+_C.MODEL.DYHEAD.FUSE_CONFIG = CN()
+_C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
+_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
+_C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
+
+# Controls for clamping intermediate values to avoid fp16 underflow/overflow
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
+
+# MLM Loss
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
+_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
+
+# Shallow Contrastive Loss (FPN)
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
+
+# Shallow Contrastive Loss (BACKBONE)
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
+
+# use checkpoint to save memory
+_C.MODEL.DYHEAD.USE_CHECKPOINT = False
+
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.USE_FPN = False
+# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
+_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
+# Stride of the feature map that RPN is attached to.
+# For FPN, number of strides should match number of scales
+_C.MODEL.RPN.ANCHOR_STRIDE = (16,)
+# RPN anchor aspect ratios
+_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
+# Anchor shift away ratio from the center for r,t,l,d
+_C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
+# Use center to decide anchor size
+_C.MODEL.RPN.USE_RELATIVE_SIZE = False
+# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.STRADDLE_THRESH = 0
+# Anchor scales per octave for complex anchors
+_C.MODEL.RPN.OCTAVE = 2.0
+_C.MODEL.RPN.SCALES_PER_OCTAVE = 3
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example)
+_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative example (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example)
+_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
+# Total number of RPN examples per image
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+# Proposal height and width both need to be greater than RPN_MIN_SIZE
+# (at the scale used during training or inference)
+_C.MODEL.RPN.MIN_SIZE = 0
+# Number of top scoring RPN proposals to keep after combining proposals from
+# all FPN levels
+_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
+_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
+# Custom rpn head, empty to use default conv or separable conv
+_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
+_C.MODEL.RPN.FREEZE = False
+_C.MODEL.RPN.FORCE_BOXES = False
+_C.MODEL.RPN.RETURN_FUSED_FEATURES = False
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.USE_FPN = False
+# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
+# Overlap threshold for an RoI to be considered background
+# (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
+_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10., 10., 5., 5.)
+# RoI minibatch size *per image* (number of regions of interest [ROIs])
+# Total number of RoIs per training minibatch =
+# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
+# E.g., a common configuration is: 512 * 2 * 8 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used on test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS = 0.5
+# Maximum number of detections to return per image (100 is based on the limit
+# established for the COCO dataset)
+_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
+
+_C.MODEL.ROI_BOX_HEAD = CN()
+_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
+# Hidden layer dimension when using an MLP for the RoI box head
+_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
+# GN
+_C.MODEL.ROI_BOX_HEAD.USE_GN = False
+# Dilation
+_C.MODEL.ROI_BOX_HEAD.DILATION = 1
+_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
+_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
+# Use D2 style ROIAlignV2
+_C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
+
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+_C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
+_C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
+_C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+# Whether or not to resize and translate masks to the input image.
+_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
+_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
+# Dilation
+_C.MODEL.ROI_MASK_HEAD.DILATION = 1
+# GN
+_C.MODEL.ROI_MASK_HEAD.USE_GN = False
+# HG
+_C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
+
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
+_C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
+_C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = () # If left empty, use default names
+_C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.USE_STEM3X3 = False
+_C.MODEL.RESNETS.WITH_SE = False
+_C.MODEL.RESNETS.USE_AVG_DOWN = False
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Baseline width of each group
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Residual transformation function
+_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
+# ResNet's stem function (conv1 and pool1)
+_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+_C.MODEL.RESNETS.REVISION = "resnet_light"
+# Deformable convolutions
+_C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
+_C.MODEL.RESNETS.WITH_MODULATED_DCN = False
+_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
+
+# ---------------------------------------------------------------------------- #
+# Swin Transformer
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SWINT = CN()
+_C.MODEL.SWINT.EMBED_DIM = 96
+_C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
+_C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
+_C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
+_C.MODEL.SWINT.WINDOW_SIZE = 7
+_C.MODEL.SWINT.MLP_RATIO = 4
+_C.MODEL.SWINT.DROP_PATH_RATE = 0.2
+_C.MODEL.SWINT.APE = False
+_C.MODEL.SWINT.VERSION = "v1"
+_C.MODEL.SWINT.OUT_NORM = True
+_C.MODEL.SWINT.LAYER_SCALE = 0
+
+# ---------------------------------------------------------------------------- #
+# CVT SPEC
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SPEC = CN(new_allowed=True)
+
+# ---------------------------------------------------------------------------- #
+# CLIP SPEC
+# ---------------------------------------------------------------------------- #
+_C.MODEL.CLIP = CN()
+_C.MODEL.CLIP.CONTEXT_LENGTH = 256 # default 77
+_C.MODEL.CLIP.WIDTH = 512
+_C.MODEL.CLIP.LAYERS = 12
+_C.MODEL.CLIP.HEADS = 8
+_C.MODEL.CLIP.DROP_PATH = 0.0
+_C.MODEL.CLIP.TOKENIZER = "clip"
+_C.MODEL.CLIP.VOCAB_SIZE = 49408
+
+# ---------------------------------------------------------------------------- #
+# SEARCH
+# ---------------------------------------------------------------------------- #
+
+_C.SEARCH = CN()
+_C.SEARCH.MAX_EPOCH = 20
+_C.SEARCH.SELECT_NUM = 20
+_C.SEARCH.POPULATION_NUM = 64
+_C.SEARCH.MUTATION_NUM = 24
+_C.SEARCH.CROSSOVER_NUM = 24
+_C.SEARCH.MUTATION_PROB = 0.1
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+_C.SOLVER.USE_AMP = False
+
+_C.SOLVER.MAX_ITER = 40000
+_C.SOLVER.MULTI_MAX_ITER = () # set different max epoch for different stage
+_C.SOLVER.MAX_EPOCH = 0 # any epoch number>0 will overwrite max_iter
+_C.SOLVER.MULTI_MAX_EPOCH = () # set different max epoch for different stage
+
+_C.SOLVER.OPTIMIZER = "SGD" # "ADAMW"
+
+_C.SOLVER.BASE_LR = 0.001
+
+_C.SOLVER.LANG_LR = 0.00001
+_C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
+
+_C.SOLVER.BIAS_LR_FACTOR = 2
+_C.SOLVER.GRAD_CLIP = 0.0
+# D2 gradient clip
+_C.SOLVER.CLIP_GRADIENTS = CN()
+_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+_C.SOLVER.MODEL_EMA = 0.0
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.WEIGHT_DECAY = 0.0005
+_C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
+_C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
+
+# use cosine lr to replace default multistage
+_C.SOLVER.USE_COSINE = False
+_C.SOLVER.MIN_LR = 0.000001
+
+_C.SOLVER.GAMMA = 0.1
+_C.SOLVER.STEPS = (30000,)
+
+_C.SOLVER.USE_AUTOSTEP = False
+_C.SOLVER.STEP_PATIENCE = 5
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 3
+_C.SOLVER.WARMUP_ITERS = 500
+_C.SOLVER.WARMUP_METHOD = "linear"
+
+_C.SOLVER.CHECKPOINT_PERIOD = 2500
+_C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
+_C.SOLVER.TEST_WITH_INFERENCE = False
+_C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
+# Number of images per batch
+# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+# see 2 images per batch
+_C.SOLVER.IMS_PER_BATCH = 16
+# This is the max negative ratio allowed per batch
+_C.SOLVER.MAX_NEG_PER_BATCH = 0.1
+
+_C.SOLVER.SEED = 0
+_C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
+
+
+_C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
+# -1 means tuning the whole model;
+# 1 means tuning the whole language model; 1.5 means tuning the box head as well
+
+_C.SOLVER.FIND_UNUSED_PARAMETERS = True
+_C.SOLVER.DATASET_LENGTH = -1 # Just for logging purposes
+_C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
+_C.SOLVER.USE_EMA_FOR_MONITOR = False
+
+_C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
+_C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+_C.TEST.EXPECTED_RESULTS = []
+_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
+_C.TEST.DURING_TRAINING = False
+# Number of images per batch
+# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+# see 2 images per batch
+_C.TEST.IMS_PER_BATCH = 16
+# Special Test Configuration
+_C.TEST.USE_MULTISCALE = False
+# _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
+# _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
+_C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
+_C.TEST.RANGES = ((96, 10000), (96, 10000), (64, 10000), (64, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 256), (0, 192), (0, 192), (0, 96))
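+# Each RANGES entry appears to pair with the SCALES entry at the same index, gating which object sizes are kept at that test scale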
+_C.TEST.MAX_SIZE = 2500
+_C.TEST.FLIP = True
+_C.TEST.SPECIAL_NMS = 'none' # ('none', 'soft-nms', 'vote', 'soft-vote')
+_C.TEST.TH = 0.6 # threshold for nms or vote
+_C.TEST.PRE_NMS_TOP_N = 1000
+_C.TEST.NUM_CLASSES = 81
+_C.TEST.SELECT_CLASSES = ()
+
+_C.TEST.EVAL_TASK = ""
+_C.TEST.SUBSET = -1
+_C.TEST.CHUNKED_EVALUATION = -1
+_C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+_C.OUTPUT_DIR = "OUTPUT"
+
+_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
+
+# TensorBoard experiment location
+_C.TENSORBOARD_EXP = "OUTPUT"
+
+
+_C.GLIPKNOW = CN()
+_C.GLIPKNOW.KNOWLEDGE_FILE = ""
+_C.GLIPKNOW.KNOWLEDGE_TYPE = ""
+_C.GLIPKNOW.MAX_NUM_CLASSES_PER_BATCH_TRAIN = -1
+_C.GLIPKNOW.PARALLEL_LANGUAGE_INPUT = False
+_C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first"
+_C.GLIPKNOW.GPT3_NUM = 5
+_C.GLIPKNOW.WIKI_AND_GPT3 = False
\ No newline at end of file
diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..be63e5715434d696cb1480c8a5b436b642808afb
--- /dev/null
+++ b/maskrcnn_benchmark/config/paths_catalog.py
@@ -0,0 +1,447 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+"""Centralized catalog of paths."""
+
+import os
+
+
+def try_to_find(file, return_dir=False, search_path=['./DATASET', './OUTPUT', './data', './MODEL']):
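+    # Search order: './', then $DATASET (if set), then the entries of search_path;
+    # returns the first match, or its containing directory when return_dir=True.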
+ if not file:
+ return file
+
+ if file.startswith('catalog://'):
+ return file
+
+ DATASET_PATH = ['./']
+ if 'DATASET' in os.environ:
+ DATASET_PATH.append(os.environ['DATASET'])
+ DATASET_PATH += search_path
+
+ for path in DATASET_PATH:
+ if os.path.exists(os.path.join(path, file)):
+ if return_dir:
+ return path
+ else:
+ return os.path.join(path, file)
+
+ print('Cannot find {} in {}'.format(file, DATASET_PATH))
+ exit(1)
+
+
+class DatasetCatalog(object):
+ DATASETS = {
+ # pretrained grounding dataset
+ # mixed vg and coco
+ "mixed_train": {
+ "coco_img_dir": "coco/train2014",
+ "vg_img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/final_mixed_train.json",
+ },
+ "mixed_train_no_coco": {
+ "coco_img_dir": "coco/train2014",
+ "vg_img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
+ },
+
+ # flickr30k
+ "flickr30k_train": {
+ "img_folder": "flickr30k/flickr30k_images/train",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
+ "is_train": True
+ },
+ "flickr30k_val": {
+ "img_folder": "flickr30k/flickr30k_images/val",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
+ "is_train": False
+ },
+ "flickr30k_test": {
+ "img_folder": "flickr30k/flickr30k_images/test",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
+ "is_train": False
+ },
+
+ # refcoco
+ "refexp_all_val": {
+ "img_dir": "refcoco/train2014",
+ "ann_file": "mdetr_annotations/final_refexp_val.json",
+ "is_train": False
+ },
+
+ # gqa
+ "gqa_val": {
+ "img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/final_gqa_val.json",
+ "is_train": False
+ },
+
+ # phrasecut
+ "phrasecut_train": {
+ "img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
+ "is_train": True
+ },
+
+
+ # od to grounding
+ # coco tsv
+ "coco_dt_train": {
+ "dataset_file": "coco_dt",
+ "yaml_path": "coco_tsv/coco_obj.yaml",
+ "is_train": True,
+ },
+ "COCO_odinw_train_8copy_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
+ "is_train": True,
+ },
+ "COCO_odinw_val_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/COCO_odinw_val.yaml",
+ "is_train": False,
+ },
+ # lvis tsv
+ "lvisv1_dt_train": {
+ "dataset_file": "lvisv1_dt",
+ "yaml_path": "coco_tsv/LVIS_v1_train.yaml",
+ "is_train": True,
+ },
+ "LVIS_odinw_train_8copy_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
+ "is_train": True,
+ },
+ # object365 tsv
+ "object365_dt_train": {
+ "dataset_file": "object365_dt",
+ "yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
+ "is_train": True,
+ },
+ "object365_odinw_2copy_dt_train": {
+ "dataset_file": "object365_odinw_dt",
+ "yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
+ "is_train": True,
+ },
+ "objects365_odtsv_train": {
+ "dataset_file": "objects365_odtsv",
+ "yaml_path": "Objects365/train.cas2000.yaml",
+ "is_train": True,
+ },
+ "objects365_odtsv_val": {
+ "dataset_file": "objects365_odtsv",
+ "yaml_path": "Objects365/val.yaml",
+ "is_train": False,
+ },
+
+ # ImagetNet OD
+ "imagenetod_train_odinw_2copy_dt": {
+ "dataset_file": "imagenetod_odinw_dt",
+ "yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
+ "is_train": True,
+ },
+
+ # OpenImage OD
+ "oi_train_odinw_dt": {
+ "dataset_file": "oi_odinw_dt",
+ "yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
+ "is_train": True,
+ },
+
+ # vg tsv
+ "vg_dt_train": {
+ "dataset_file": "vg_dt",
+ "yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
+ "is_train": True,
+ },
+
+ "vg_odinw_clipped_8copy_dt_train": {
+ "dataset_file": "vg_odinw_clipped_8copy_dt",
+ "yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
+ "is_train": True,
+ },
+ "vg_vgoi6_clipped_8copy_dt_train": {
+ "dataset_file": "vg_vgoi6_clipped_8copy_dt",
+ "yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
+ "is_train": True,
+ },
+
+ # coco json
+ "coco_grounding_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/instances_train2017.json",
+ "is_train": True,
+ },
+
+ "lvis_grounding_train": {
+ "img_dir": "coco",
+ "ann_file": "coco/annotations/lvis_od_train.json"
+ },
+
+
+ "lvis_val": {
+ "img_dir": "coco",
+ "ann_file": "coco/annotations/lvis_od_val.json"
+ },
+ "coco_2017_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/instances_train2017.json"
+ },
+ "coco_2017_val": {
+ "img_dir": "coco/val2017",
+ "ann_file": "coco/annotations/instances_val2017.json"
+ },
+ "coco_2017_test": {
+ "img_dir": "coco/test2017",
+ "ann_file": "coco/annotations/image_info_test-dev2017.json"
+ },
+ "coco_2014_train": {
+ "img_dir": "coco/train2014",
+ "ann_file": "coco/annotations/instances_train2014.json"
+ },
+ "coco_2014_val": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/instances_val2014.json"
+ },
+ "coco_2014_minival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/instances_minival2014.json"
+ },
+ }
+
+ @staticmethod
+ def set(name, info):
+ DatasetCatalog.DATASETS.update({name: info})
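+
+    # Example (names are placeholders): register a COCO-format dataset at
+    # runtime, then resolve it like any built-in entry:
+    #   DatasetCatalog.set("my_coco_train",
+    #                      {"img_dir": "my_data/images", "ann_file": "my_data/train.json"})
+    #   info = DatasetCatalog.get("my_coco_train")  # {"factory": "COCODataset", ...}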
+
+ @staticmethod
+ def get(name):
+
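+        # The name is dispatched on substrings: each branch below resolves the
+        # catalog's relative paths with try_to_find and returns the dataset
+        # factory class name together with its constructor kwargs.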
+ if name.endswith('_bg'):
+ attrs = DatasetCatalog.DATASETS[name]
+ data_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ root=os.path.join(data_dir, attrs["img_dir"]),
+ ann_file=os.path.join(data_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="Background",
+ args=args,
+ )
+ else:
+ if "bing" in name.split("_"):
+ attrs = DatasetCatalog.DATASETS["bing_caption_train"]
+ else:
+ attrs = DatasetCatalog.DATASETS[name]
+
+ if "voc" in name and 'split' in attrs:
+ data_dir = try_to_find(attrs["data_dir"], return_dir=True)
+ args = dict(
+ data_dir=os.path.join(data_dir, attrs["data_dir"]),
+ split=attrs["split"],
+ )
+ return dict(
+ factory="PascalVOCDataset",
+ args=args,
+ )
+ elif "mixed" in name:
+ vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
+ coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
+ ann_file = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
+ img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
+ ann_file=os.path.join(ann_file, attrs["ann_file"])
+ )
+ return dict(
+ factory="MixedDataset",
+ args=args,
+ )
+ elif "flickr" in name:
+ img_dir = try_to_find(attrs["img_folder"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_folder"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ is_train=attrs["is_train"]
+ )
+ return dict(
+ factory="FlickrDataset",
+ args=args,
+ )
+ elif "refexp" in name:
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="RefExpDataset",
+ args=args,
+ )
+ elif "gqa" in name:
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="GQADataset",
+ args=args,
+ )
+ elif "phrasecut" in name:
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="PhrasecutDetection",
+ args=args,
+ )
+ elif "_caption" in name:
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+ if "no_coco" in name:
+ yaml_name = attrs["yaml_name_no_coco"]
+ else:
+ yaml_name = attrs["yaml_name"]
+ yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
+ args = dict(
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name)
+ )
+ return dict(
+ factory="CaptionTSV",
+ args=args,
+ )
+ elif "inferencecap" in name:
+ yaml_file_name = try_to_find(attrs["yaml_path"])
+ args = dict(
+ yaml_file=yaml_file_name)
+ return dict(
+ factory="CaptionTSV",
+ args=args,
+ )
+ elif "pseudo_data" in name:
+ args = dict(
+ yaml_file=try_to_find(attrs["yaml_path"])
+ )
+ return dict(
+ factory="PseudoData",
+ args=args,
+ )
+ elif "_dt" in name:
+ dataset_file = attrs["dataset_file"]
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+ args = dict(
+ name=dataset_file,
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+ )
+ return dict(
+ factory="CocoDetectionTSV",
+ args=args,
+ )
+ elif "_odtsv" in name:
+ dataset_file = attrs["dataset_file"]
+ yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+ args = dict(
+ name=dataset_file,
+ yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+ )
+ return dict(
+ factory="ODTSVDataset",
+ args=args,
+ )
+ elif "_grounding" in name:
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="CocoGrounding",
+ args=args,
+ )
+ elif "lvis_evaluation" in name:
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ args = dict(
+ img_folder=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ return dict(
+ factory="LvisDetection",
+ args=args,
+ )
+ else:
+ ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+ img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+ args = dict(
+ root=os.path.join(img_dir, attrs["img_dir"]),
+ ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+ )
+ for k, v in attrs.items():
+ args.update({k: os.path.join(ann_dir, v)})
+ return dict(
+ factory="COCODataset",
+ args=args,
+ )
+
+ raise RuntimeError("Dataset not available: {}".format(name))
+
+
+class ModelCatalog(object):
+ S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
+ C2_IMAGENET_MODELS = {
+ "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+ "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+ "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+ "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+ "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+ "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+ }
+
+ C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
+ C2_DETECTRON_MODELS = {
+ "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
+ "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
+ "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
+ "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
+ "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
+ "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
+ "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
+ "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
+ }
+
+ @staticmethod
+ def get(name):
+ if name.startswith("Caffe2Detectron/COCO"):
+ return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
+ if name.startswith("ImageNetPretrained"):
+ return ModelCatalog.get_c2_imagenet_pretrained(name)
+ raise RuntimeError("model not present in the catalog {}".format(name))
+
+ @staticmethod
+ def get_c2_imagenet_pretrained(name):
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
+ name = name[len("ImageNetPretrained/"):]
+ name = ModelCatalog.C2_IMAGENET_MODELS[name]
+ url = "/".join([prefix, name])
+ return url
+
+ @staticmethod
+ def get_c2_detectron_12_2017_baselines(name):
+        # Detectron C2 models are stored following the structure
+        # prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
+        # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
+ prefix = ModelCatalog.S3_C2_DETECTRON_URL
+ suffix = ModelCatalog.C2_DETECTRON_SUFFIX
+ # remove identification prefix
+ name = name[len("Caffe2Detectron/COCO/"):]
+        # split in <model_id> and <model_name>
+ model_id, model_name = name.split("/")
+ # parsing to make it match the url address from the Caffe2 models
+ model_name = "{}.yaml".format(model_name)
+ signature = ModelCatalog.C2_DETECTRON_MODELS[name]
+ unique_name = ".".join([model_name, signature])
+ url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
+ return url
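+
+
+# For example, ModelCatalog.get("Caffe2Detectron/COCO/35857345/e2e_faster_rcnn_R-50-FPN_1x")
+# resolves to:
+#   https://dl.fbaipublicfiles.com/detectron/35857345/12_2017_baselines/
+#   e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I/output/train/
+#   coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl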
diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h
new file mode 100644
index 0000000000000000000000000000000000000000..2683dbf52e120eebb7b60bb2257cd3527c5a86c3
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/ROIAlign.h
@@ -0,0 +1,46 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+// Interface for Python
+at::Tensor ROIAlign_forward(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ if (input.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+}
+
+at::Tensor ROIAlign_backward(const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ if (grad.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
+
diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b62b2dcb8f69ac65bc1fdf0eeb5fa556539bc13
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/ROIPool.h
@@ -0,0 +1,48 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+
+std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width) {
+ if (input.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
+
+at::Tensor ROIPool_backward(const at::Tensor& grad,
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const at::Tensor& argmax,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width) {
+ if (grad.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
+
+
+
diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
new file mode 100644
index 0000000000000000000000000000000000000000..e220c12ae558a176f6b4b0a6640e724358f2ecb0
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+// Interface for Python
+at::Tensor SigmoidFocalLoss_forward(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ if (logits.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
+
+at::Tensor SigmoidFocalLoss_backward(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const at::Tensor& d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ if (logits.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c061351588df7752293ed84bba1c900768e3ab8
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
@@ -0,0 +1,257 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+// implementation taken from Caffe2
+template <typename T>
+struct PreCalc {
+ int pos1;
+ int pos2;
+ int pos3;
+ int pos4;
+ T w1;
+ T w2;
+ T w3;
+ T w4;
+};
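+// Each PreCalc entry caches the four corner offsets (pos1..pos4) and bilinear
+// weights (w1..w4) of one sampling point, so the per-channel loop becomes a
+// fused multiply-add over precomputed values instead of re-deriving them.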
+
+template <typename T>
+void pre_calc_for_bilinear_interpolate(
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int iy_upper,
+ const int ix_upper,
+ T roi_start_h,
+ T roi_start_w,
+ T bin_size_h,
+ T bin_size_w,
+ int roi_bin_grid_h,
+ int roi_bin_grid_w,
+    std::vector<PreCalc<T>>& pre_calc) {
+ int pre_calc_index = 0;
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ for (int iy = 0; iy < iy_upper; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+          static_cast<T>(iy + .5f) * bin_size_h /
+              static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+            static_cast<T>(ix + .5f) * bin_size_w /
+                static_cast<T>(roi_bin_grid_w);
+
+ T x = xx;
+ T y = yy;
+          // deal with sampling points that fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+          PreCalc<T> pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y <= 0) {
+ y = 0;
+ }
+ if (x <= 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+        // save weights and indices
+        PreCalc<T> pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+void ROIAlignForward_cpu_kernel(
+ const int nthreads,
+ const T* bottom_data,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois,
+ //int roi_cols,
+ T* top_data) {
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
+ int roi_cols = 5;
+
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ // roi could have 4 or 5 columns
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+ int roi_batch_ind = 0;
+ if (roi_cols == 5) {
+ roi_batch_ind = offset_bottom_rois[0];
+ offset_bottom_rois++;
+ }
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    // we want to precalculate indices and weights shared by all channels,
+    // this is the key point of optimization
+    std::vector<PreCalc<T>> pre_calc(
+        roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+          PreCalc<T> pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+ pc.w2 * offset_bottom_data[pc.pos2] +
+ pc.w3 * offset_bottom_data[pc.pos3] +
+ pc.w4 * offset_bottom_data[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+}
+
+at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
+ AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+    ROIAlignForward_cpu_kernel<scalar_t>(
+ output_size,
+        input.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+        rois.data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>());
+ });
+ return output;
+}
diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11b7aa60fdca907352b334f142faadb46d662f99
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+
+template <typename scalar_t>
+at::Tensor nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+ auto ndets = dets.size(0);
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
+
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto order = order_t.data_ptr<int64_t>();
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto areas = areas_t.data_ptr<scalar_t>();
+
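+  // Greedy NMS: visit boxes in descending score order and suppress every
+  // lower-scored box whose IoU with a kept box reaches the threshold.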
+ for (int64_t _i = 0; _i < ndets; _i++) {
+ auto i = order[_i];
+ if (suppressed[i] == 1)
+ continue;
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto iarea = areas[i];
+
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
+ auto j = order[_j];
+ if (suppressed[j] == 1)
+ continue;
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+ if (ovr >= threshold)
+ suppressed[j] = 1;
+ }
+ }
+ return at::nonzero(suppressed_t == 0).squeeze(1);
+}
+
+at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ at::Tensor result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
+    result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
+ });
+ return result;
+}
diff --git a/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp b/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..423941d71e29f5b9823006d57cdf0088646586ed
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+
+template <typename scalar_t>
+std::pair<at::Tensor, at::Tensor> soft_nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
+ at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ auto scores_t = scores.clone();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+ auto ndets = dets.size(0);
+ auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
+
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto s = scores_t.data_ptr<scalar_t>();
+  auto inds = inds_t.data_ptr<int64_t>();
+  auto areas = areas_t.data_ptr<scalar_t>();
+
+ for (int64_t i = 0; i < ndets; i++) {
+
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto is = s[i];
+ auto ii = inds[i];
+ auto iarea = areas[i];
+
+    auto maxpos = scores_t.slice(0, i, ndets).argmax().item<int64_t>() + i;
+
+ // add max box as a detection
+ x1[i] = x1[maxpos];
+ y1[i] = y1[maxpos];
+ x2[i] = x2[maxpos];
+ y2[i] = y2[maxpos];
+ s[i] = s[maxpos];
+ inds[i] = inds[maxpos];
+ areas[i] = areas[maxpos];
+
+ // swap ith box with position of max box
+ x1[maxpos] = ix1;
+ y1[maxpos] = iy1;
+ x2[maxpos] = ix2;
+ y2[maxpos] = iy2;
+ s[maxpos] = is;
+ inds[maxpos] = ii;
+ areas[maxpos] = iarea;
+
+ ix1 = x1[i];
+ iy1 = y1[i];
+ ix2 = x2[i];
+ iy2 = y2[i];
+ iarea = areas[i];
+
+ // NMS iterations, note that ndets changes if detection boxes
+ // fall below threshold
+ for (int64_t j = i + 1; j < ndets; j++) {
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+      auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+      auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+
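+      // Gaussian soft-NMS: decay the score by exp(-IoU^2 / sigma) rather than
+      // hard-suppressing overlapping boxes.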
+ s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
+
+      // if the decayed score falls below threshold, discard the box by
+      // swapping it with the last box, then shrink ndets
+ if (s[j] < threshold) {
+ x1[j] = x1[ndets - 1];
+ y1[j] = y1[ndets - 1];
+ x2[j] = x2[ndets - 1];
+ y2[j] = y2[ndets - 1];
+ s[j] = s[ndets - 1];
+ inds[j] = inds[ndets - 1];
+ areas[j] = areas[ndets - 1];
+ j--;
+ ndets--;
+ }
+ }
+ }
+ return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
+}
+
+std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+  std::pair<at::Tensor, at::Tensor> result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
+    result = soft_nms_cpu_kernel<scalar_t>(dets, scores, threshold, sigma);
+ });
+ return result;
+}
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h
new file mode 100644
index 0000000000000000000000000000000000000000..e00ef683150eb9d46d0e4f6a30f55a7230a52e93
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/vision.h
@@ -0,0 +1,22 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+#include <torch/extension.h>
+
+
+at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+
+at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold);
+
+
+std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma);
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9ed1a0adfd841a17d3574dee6ac703820fcfe144
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
@@ -0,0 +1,346 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
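+// The macro above is a grid-stride loop: each thread starts at its global
+// index and advances by the total number of launched threads, so a fixed
+// grid size covers any n.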
+
+template <typename T>
+__device__ T bilinear_interpolate(const T* bottom_data,
+ const int height, const int width,
+ T y, T x,
+ const int index /* index for debug only*/) {
+
+  // deal with cases where the sampling point falls outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ //empty
+ return 0;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int) y;
+ int x_low = (int) x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = bottom_data[y_low * width + x_low];
+ T v2 = bottom_data[y_low * width + x_high];
+ T v3 = bottom_data[y_high * width + x_low];
+ T v4 = bottom_data[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois, T* top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+}
+
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height, const int width,
+ T y, T x,
+ T & w1, T & w2, T & w3, T & w4,
+ int & x_low, int & x_high, int & y_low, int & y_high,
+ const int index /* index for debug only*/) {
+
+  // deal with cases where the sampling point falls outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ //empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int) y;
+ x_low = (int) x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = bottom_data[y_low * width + x_low];
+ // T v2 = bottom_data[y_low * width + x_high];
+ // T v3 = bottom_data[y_high * width + x_low];
+ // T v4 = bottom_data[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
+ const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(height, width, y, x,
+ w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high,
+ index);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
+ {
+          atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+         input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+         rois.contiguous().data_ptr<scalar_t>(),
+         output.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+    RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+         grad.contiguous().data_ptr<scalar_t>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+         grad_input.data_ptr<scalar_t>(),
+         rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..60fc9fbc55956304c7ff6b48cbf3c086029b8354
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
@@ -0,0 +1,202 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+template <typename T>
+__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels, const int height,
+ const int width, const int pooled_height, const int pooled_width,
+ const T* bottom_rois, T* top_data, int* argmax_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+ int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height)
+                   / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width)
+                   / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph)
+                                        * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw)
+                                        * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
+                                     * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
+                                     * bin_size_w));
+
+ // Add roi offsets and clip to input boundaries
+ hstart = min(max(hstart + roi_start_h, 0), height);
+ hend = min(max(hend + roi_start_h, 0), height);
+ wstart = min(max(wstart + roi_start_w, 0), width);
+ wend = min(max(wend + roi_start_w, 0), width);
+ bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+ // Define an empty pooling region to be zero
+ T maxval = is_empty ? 0 : -FLT_MAX;
+ // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+ int maxidx = -1;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int bottom_index = h * width + w;
+ if (offset_bottom_data[bottom_index] > maxval) {
+ maxval = offset_bottom_data[bottom_index];
+ maxidx = bottom_index;
+ }
+ }
+ }
+ top_data[index] = maxval;
+ argmax_data[index] = maxidx;
+ }
+}
+
+template <typename T>
+__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
+ const int* argmax_data, const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int bottom_offset = (roi_batch_ind * channels + c) * height * width;
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ T* offset_bottom_diff = bottom_diff + bottom_offset;
+ const int* offset_argmax_data = argmax_data + top_offset;
+
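+    // Route the incoming gradient to the single input cell that won the max
+    // pool; atomicAdd because overlapping ROIs may target the same cell.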
+ int argmax = offset_argmax_data[ph * pooled_width + pw];
+ if (argmax != -1) {
+ atomicAdd(
+ offset_bottom_diff + argmax,
+ static_cast(offset_top_diff[ph * pooled_width + pw]));
+
+ }
+ }
+}
+
+std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
+    RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+         input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+         rois.contiguous().data_ptr<scalar_t>(),
+         output.data_ptr<scalar_t>(),
+         argmax.data_ptr<int>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const at::Tensor& argmax,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+ // TODO add more checks
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
+    RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+         grad.contiguous().data_ptr<scalar_t>(),
+         argmax.data_ptr<int>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+         grad_input.data_ptr<scalar_t>(),
+         rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aeceae0f825598cd36ea99add8da613c5e2482a
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
@@ -0,0 +1,188 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
+// Cheng-Yang Fu
+// cyfu@cs.unc.edu
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <cfloat>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
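+// Focal loss (Lin et al.): FL(p_t) = -alpha_t * (1 - p_t)^gamma * log(p_t).
+// The kernel evaluates it per (sample, class) logit; term1 handles the target
+// class, term2 the negatives, both in a numerically stable log-sigmoid form.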
+template <typename T>
+__global__ void SigmoidFocalLossForward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* losses) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80];
+
+ // Decide it is positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+
+    // p = 1. / (1. + expf(-x)); i.e. p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+    // (1-p)**gamma * log(p)
+ T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
+
+ // p**gamma * log(1-p)
+ T term2 = powf(p, gamma) *
+ (-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
+
+ losses[i] = 0.0;
+ losses[i] += -c1 * term1 * zp;
+ losses[i] += -c2 * term2 * zn;
+
+ } // CUDA_1D_KERNEL_LOOP
+} // SigmoidFocalLossForward
+
+
+template <typename T>
+__global__ void SigmoidFocalLossBackward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const T* d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* d_logits) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80], 0 is background;
+
+ // Decide it is positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+    // p = 1. / (1. + expf(-x)); i.e. p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+    // (1-p)**g * (1 - p - g*p*log(p))
+ T term1 = powf((1. - p), gamma) *
+ (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
+
+ // (p**g) * (g*(1-p)*log(1-p) - p)
+ T term2 = powf(p, gamma) *
+ ((-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
+ (1. - p) * gamma - p);
+ d_logits[i] = 0.0;
+ d_logits[i] += -c1 * term1 * zp;
+ d_logits[i] += -c2 * term2 * zn;
+ d_logits[i] = d_logits[i] * d_losses[i];
+
+ } // CUDA_1D_KERNEL_LOOP
+} // SigmoidFocalLossBackward
+
+
+at::Tensor SigmoidFocalLoss_forward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+
+ auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
+ auto losses_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (losses.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return losses;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
+    SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
+ losses_size,
+         logits.contiguous().data_ptr<scalar_t>(),
+         targets.contiguous().data_ptr<int>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+         losses.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return losses;
+}
+
+
+at::Tensor SigmoidFocalLoss_backward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const at::Tensor& d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
+
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+ AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
+
+ auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
+ auto d_logits_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (d_logits.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
+    SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
+ d_logits_size,
+         logits.contiguous().data_ptr<scalar_t>(),
+         targets.contiguous().data_ptr<int>(),
+         d_losses.contiguous().data_ptr<scalar_t>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+         d_logits.data_ptr<scalar_t>());
+ });
+
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+}
+
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2cdf8d61957e50d452dd230c97b5754dacd2fa0e
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu
@@ -0,0 +1,691 @@
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <vector>
+#include <iostream>
+#include <cmath>
+
+
+void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
+ const int channels, const int height, const int width,
+ const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group,
+ at::Tensor data_col);
+
+void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
+ const int channels, const int height, const int width,
+ const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group,
+ at::Tensor grad_im);
+
+void deformable_col2im_coord(
+ const at::Tensor data_col, const at::Tensor data_im,
+ const at::Tensor data_offset, const int channels, const int height,
+ const int width, const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
+ const int deformable_group, at::Tensor grad_offset);
+
+void modulated_deformable_im2col_cuda(
+ const at::Tensor data_im, const at::Tensor data_offset,
+ const at::Tensor data_mask, const int batch_size, const int channels,
+ const int height_im, const int width_im, const int height_col,
+    const int width_col, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int deformable_group,
+ at::Tensor data_col);
+
+void modulated_deformable_col2im_cuda(
+ const at::Tensor data_col, const at::Tensor data_offset,
+ const at::Tensor data_mask, const int batch_size, const int channels,
+ const int height_im, const int width_im, const int height_col,
+    const int width_col, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int deformable_group,
+ at::Tensor grad_im);
+
+void modulated_deformable_col2im_coord_cuda(
+ const at::Tensor data_col, const at::Tensor data_im,
+ const at::Tensor data_offset, const at::Tensor data_mask,
+ const int batch_size, const int channels, const int height_im,
+ const int width_im, const int height_col, const int width_col,
+    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w, const int dilation_h,
+ const int dilation_w, const int deformable_group, at::Tensor grad_offset,
+ at::Tensor grad_mask);
+
+void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
+ at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
+ int padW, int dilationH, int dilationW, int group,
+ int deformable_group)
+{
+ TORCH_CHECK(weight.ndimension() == 4,
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+ "but got: %s",
+ weight.ndimension());
+
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+ TORCH_CHECK(kW > 0 && kH > 0,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
+ kW);
+
+ TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
+ "kernel size should be consistent with weight, ",
+ "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
+ kW, weight.size(2), weight.size(3));
+
+ TORCH_CHECK(dW > 0 && dH > 0,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ TORCH_CHECK(
+ dilationW > 0 && dilationH > 0,
+ "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
+ dilationH, dilationW);
+
+ int ndim = input.ndimension();
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
+ ndim);
+
+ long nInputPlane = weight.size(1) * group;
+ long inputHeight = input.size(dimh);
+ long inputWidth = input.size(dimw);
+ long nOutputPlane = weight.size(0);
+ long outputHeight =
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth =
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
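+
+  // The two sizes above follow the standard dilated-convolution formula:
+  // out = floor((in + 2*pad - (dilation*(k-1) + 1)) / stride) + 1, where
+  // dilation*(k-1) + 1 is the effective kernel extent.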
+
+ TORCH_CHECK(nInputPlane % deformable_group == 0,
+ "input channels must divide deformable group size");
+
+ if (outputWidth < 1 || outputHeight < 1)
+ AT_ERROR(
+ "Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
+ outputWidth);
+
+ TORCH_CHECK(input.size(1) == nInputPlane,
+ "invalid number of input planes, expected: %d, but got: %d",
+ nInputPlane, input.size(1));
+
+ TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
+ "input image is smaller than kernel");
+
+ TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
+ "invalid spatial size of offset, expected height: %d width: %d, but "
+ "got height: %d width: %d",
+ outputHeight, outputWidth, offset.size(2), offset.size(3));
+
+ TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
+ "invalid number of channels of offset");
+
+ if (gradOutput != NULL) {
+ TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
+ "invalid number of gradOutput planes, expected: %d, but got: %d",
+ nOutputPlane, gradOutput->size(dimf));
+
+ TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
+ gradOutput->size(dimw) == outputWidth),
+ "invalid size of gradOutput, expected height: %d width: %d , but "
+ "got height: %d width: %d",
+ outputHeight, outputWidth, gradOutput->size(dimh),
+ gradOutput->size(dimw));
+ }
+}
+
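+// Forward pass of deformable convolution (DCNv1), implemented as im2col +
+// GEMM. The batch is processed in chunks of im2col_step images: each chunk is
+// unfolded into a column matrix by deformable_im2col (bilinearly sampling the
+// input at offset-shifted locations), which is then multiplied against the
+// flattened weights per group via addmm_. Output spatial size follows the
+// usual convolution arithmetic, e.g.
+//   outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1
+// so a 3x3 kernel with pad 1, stride 1, dilation 1 preserves the input size.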
+int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
+ at::Tensor offset, at::Tensor output,
+ at::Tensor columns, at::Tensor ones, int kW,
+ int kH, int dW, int dH, int padW, int padH,
+ int dilationW, int dilationH, int group,
+ int deformable_group, int im2col_step)
+{
+ // todo: resize columns to include im2col: done
+ // todo: add im2col_step as input
+ // todo: add new output buffer and transpose it to output (or directly
+ // transpose output)
+ // todo: possibly change data indexing because of parallel_imgs
+
+ shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW, group, deformable_group);
+
+ input = input.contiguous();
+ offset = offset.contiguous();
+ weight = weight.contiguous();
+
+ int batch = 1;
+ if (input.ndimension() == 3) {
+ // Force batch
+ batch = 0;
+ input.unsqueeze_(0);
+ offset.unsqueeze_(0);
+ }
+
+ // todo: assert batchsize dividable by im2col_step
+
+ long batchSize = input.size(0);
+ long nInputPlane = input.size(1);
+ long inputHeight = input.size(2);
+ long inputWidth = input.size(3);
+
+ long nOutputPlane = weight.size(0);
+
+ long outputWidth =
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight =
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+ output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
+ outputHeight, outputWidth});
+ columns = at::zeros(
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+ input.options());
+
+ if (ones.ndimension() != 2 ||
+ ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
+ ones = at::ones({outputHeight, outputWidth}, input.options());
+ }
+
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+ inputHeight, inputWidth});
+ offset =
+ offset.view({batchSize / im2col_step, im2col_step,
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ at::Tensor output_buffer =
+ at::zeros({batchSize / im2col_step, nOutputPlane,
+ im2col_step * outputHeight, outputWidth},
+ output.options());
+
+ output_buffer = output_buffer.view(
+ {output_buffer.size(0), group, output_buffer.size(1) / group,
+ output_buffer.size(2), output_buffer.size(3)});
+
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+ dilationW, im2col_step, deformable_group, columns);
+
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
+ weight.size(2), weight.size(3)});
+
+ for (int g = 0; g < group; g++) {
+ output_buffer[elt][g] = output_buffer[elt][g]
+ .flatten(1)
+ .addmm_(weight[g].flatten(1), columns[g])
+ .view_as(output_buffer[elt][g]);
+ }
+ }
+
+ output_buffer = output_buffer.view(
+ {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
+ output_buffer.size(3), output_buffer.size(4)});
+
+ output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
+ im2col_step, outputHeight, outputWidth});
+ output_buffer.transpose_(1, 2);
+ output.copy_(output_buffer);
+ output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+ offset = offset.view(
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ if (batch == 0) {
+ output = output.view({nOutputPlane, outputHeight, outputWidth});
+ input = input.view({nInputPlane, inputHeight, inputWidth});
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+ }
+
+ return 1;
+}
+
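+// Backward pass w.r.t. input and offsets. gradOutput is folded back into
+// column space through the transposed weights, then deformable_col2im_coord
+// scatters gradients into gradOffset and deformable_col2im scatters them
+// (with atomicAdd) into gradInput.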
+int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
+ at::Tensor gradOutput, at::Tensor gradInput,
+ at::Tensor gradOffset, at::Tensor weight,
+ at::Tensor columns, int kW, int kH, int dW,
+ int dH, int padW, int padH, int dilationW,
+ int dilationH, int group,
+ int deformable_group, int im2col_step)
+{
+ shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW, group, deformable_group);
+
+ input = input.contiguous();
+ offset = offset.contiguous();
+ gradOutput = gradOutput.contiguous();
+ weight = weight.contiguous();
+
+ int batch = 1;
+
+ if (input.ndimension() == 3) {
+ // Force batch
+ batch = 0;
+ input = input.view({1, input.size(0), input.size(1), input.size(2)});
+ offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+ gradOutput = gradOutput.view(
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+ }
+
+ long batchSize = input.size(0);
+ long nInputPlane = input.size(1);
+ long inputHeight = input.size(2);
+ long inputWidth = input.size(3);
+
+ long nOutputPlane = weight.size(0);
+
+ long outputWidth =
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight =
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+ columns = at::zeros(
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+ input.options());
+
+ // change order of grad output
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+ nOutputPlane, outputHeight, outputWidth});
+ gradOutput.transpose_(1, 2);
+
+ gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
+ inputHeight, inputWidth});
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+ inputHeight, inputWidth});
+ gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
+ deformable_group * 2 * kH * kW, outputHeight,
+ outputWidth});
+ offset =
+ offset.view({batchSize / im2col_step, im2col_step,
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+ // divide into groups
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
+ weight.size(2), weight.size(3)});
+ gradOutput = gradOutput.view(
+ {gradOutput.size(0), group, gradOutput.size(1) / group,
+ gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
+
+ for (int g = 0; g < group; g++) {
+ columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+ gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
+ }
+
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ gradOutput = gradOutput.view(
+ {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
+ gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
+
+ deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
+ inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW, im2col_step, deformable_group,
+ gradOffset[elt]);
+
+ deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+ dilationW, im2col_step, deformable_group, gradInput[elt]);
+ }
+
+ gradOutput.transpose_(1, 2);
+ gradOutput =
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+ gradOffset = gradOffset.view(
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+ offset = offset.view(
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ if (batch == 0) {
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+ input = input.view({nInputPlane, inputHeight, inputWidth});
+ gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+ gradOffset =
+ gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
+ }
+
+ return 1;
+}
+
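+// Backward pass w.r.t. the weights. Re-runs deformable_im2col on the saved
+// input/offset pair and accumulates, per group and per batch chunk,
+// gradWeight += scale * gradOutput * columns^T via addmm_.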
+int deform_conv_backward_parameters_cuda(
+ at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
+ at::Tensor gradWeight, // at::Tensor gradBias,
+ at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
+ int padW, int padH, int dilationW, int dilationH, int group,
+ int deformable_group, float scale, int im2col_step)
+{
+ // todo: transpose and reshape outGrad
+ // todo: reshape columns
+ // todo: add im2col_step as input
+
+ shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
+ padW, dilationH, dilationW, group, deformable_group);
+
+ input = input.contiguous();
+ offset = offset.contiguous();
+ gradOutput = gradOutput.contiguous();
+
+ int batch = 1;
+
+ if (input.ndimension() == 3) {
+ // Force batch
+ batch = 0;
+ input = input.view(
+ at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+ gradOutput = gradOutput.view(
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+ }
+
+ long batchSize = input.size(0);
+ long nInputPlane = input.size(1);
+ long inputHeight = input.size(2);
+ long inputWidth = input.size(3);
+
+ long nOutputPlane = gradWeight.size(0);
+
+ long outputWidth =
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight =
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+ columns = at::zeros(
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+ input.options());
+
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+ nOutputPlane, outputHeight, outputWidth});
+ gradOutput.transpose_(1, 2);
+
+ at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+ gradOutputBuffer =
+ gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
+ outputHeight, outputWidth});
+ gradOutputBuffer.copy_(gradOutput);
+ gradOutputBuffer =
+ gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
+ im2col_step * outputHeight, outputWidth});
+
+ gradOutput.transpose_(1, 2);
+ gradOutput =
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+ inputHeight, inputWidth});
+ offset =
+ offset.view({batchSize / im2col_step, im2col_step,
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+ dilationW, im2col_step, deformable_group, columns);
+
+ // divide into group
+ gradOutputBuffer = gradOutputBuffer.view(
+ {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+ gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+ gradWeight =
+ gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+ gradWeight.size(2), gradWeight.size(3)});
+
+ for (int g = 0; g < group; g++) {
+ gradWeight[g] = gradWeight[g]
+ .flatten(1)
+ .addmm_(gradOutputBuffer[elt][g].flatten(1),
+ columns[g].transpose(1, 0), 1.0, scale)
+ .view_as(gradWeight[g]);
+ }
+ gradOutputBuffer = gradOutputBuffer.view(
+ {gradOutputBuffer.size(0),
+ gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+ gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+ gradWeight.size(2), gradWeight.size(3),
+ gradWeight.size(4)});
+ }
+
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+ offset = offset.view(
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+ if (batch == 0) {
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+ input = input.view({nInputPlane, inputHeight, inputWidth});
+ }
+
+ return 1;
+}
+
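+// Forward pass of modulated deformable convolution (DCNv2). Structurally the
+// same im2col + GEMM scheme as deform_conv_forward_cuda, but each sampled
+// value is additionally scaled by a learned modulation mask, the batch is
+// processed one image at a time, and an optional bias is broadcast-added at
+// the end.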
+void modulated_deform_conv_cuda_forward(
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+ at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+ int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+ const int pad_h, const int pad_w, const int dilation_h,
+ const int dilation_w, const int group, const int deformable_group,
+ const bool with_bias)
+{
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+ const int batch = input.size(0);
+ const int channels = input.size(1);
+ const int height = input.size(2);
+ const int width = input.size(3);
+
+ const int channels_out = weight.size(0);
+ const int channels_kernel = weight.size(1);
+ const int kernel_h_ = weight.size(2);
+ const int kernel_w_ = weight.size(3);
+
+ if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+ AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+ kernel_h_, kernel_w, kernel_h_, kernel_w_);
+ if (channels != channels_kernel * group)
+ AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
+ channels, channels_kernel * group);
+
+ const int height_out =
+ (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_out =
+ (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+ if (ones.ndimension() != 2 ||
+ ones.size(0) * ones.size(1) < height_out * width_out) {
+ // Resize plane and fill with ones...
+ ones = at::ones({height_out, width_out}, input.options());
+ }
+
+ // resize output
+ output = output.view({batch, channels_out, height_out, width_out}).zero_();
+ // resize temporary columns
+ columns =
+ at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+ input.options());
+
+ output = output.view({output.size(0), group, output.size(1) / group,
+ output.size(2), output.size(3)});
+
+ for (int b = 0; b < batch; b++) {
+ modulated_deformable_im2col_cuda(
+ input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, deformable_group, columns);
+
+ // divide into group
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
+ weight.size(2), weight.size(3)});
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+ for (int g = 0; g < group; g++) {
+ output[b][g] = output[b][g]
+ .flatten(1)
+ .addmm_(weight[g].flatten(1), columns[g])
+ .view_as(output[b][g]);
+ }
+
+ weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+ weight.size(3), weight.size(4)});
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ }
+
+ output = output.view({output.size(0), output.size(1) * output.size(2),
+ output.size(3), output.size(4)});
+
+ if (with_bias) {
+ output += bias.view({1, bias.size(0), 1, 1});
+ }
+}
+
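+// Backward pass of modulated deformable convolution. Per image it computes,
+// in order: the column gradients (transposed weights x grad_output), the
+// offset and mask gradients (col2im_coord), the input gradient (col2im), and
+// finally the weight and bias gradients (im2col + GEMM), accumulating across
+// the batch.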
+void modulated_deform_conv_cuda_backward(
+ at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+ at::Tensor offset, at::Tensor mask, at::Tensor columns,
+ at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+ at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+ int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+ int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+ const bool with_bias)
+{
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+ const int batch = input.size(0);
+ const int channels = input.size(1);
+ const int height = input.size(2);
+ const int width = input.size(3);
+
+ const int channels_kernel = weight.size(1);
+ const int kernel_h_ = weight.size(2);
+ const int kernel_w_ = weight.size(3);
+ if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+ AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+ kernel_h_, kernel_w, kernel_h_, kernel_w_);
+ if (channels != channels_kernel * group)
+ AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
+ channels, channels_kernel * group);
+
+ const int height_out =
+ (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_out =
+ (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+ if (ones.ndimension() != 2 ||
+ ones.size(0) * ones.size(1) < height_out * width_out) {
+ // Resize plane and fill with ones...
+ ones = at::ones({height_out, width_out}, input.options());
+ }
+
+ grad_input = grad_input.view({batch, channels, height, width});
+ columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+ input.options());
+
+ grad_output =
+ grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+ grad_output.size(2), grad_output.size(3)});
+
+ for (int b = 0; b < batch; b++) {
+ // divide into groups
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
+ weight.size(2), weight.size(3)});
+
+ for (int g = 0; g < group; g++) {
+ columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+ grad_output[b][g].flatten(1), 0.0f, 1.0f);
+ }
+
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+ weight.size(3), weight.size(4)});
+
+ // gradient w.r.t. input coordinate data
+ modulated_deformable_col2im_coord_cuda(
+ columns, input[b], offset[b], mask[b], 1, channels, height, width,
+ height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
+ stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
+ grad_mask[b]);
+ // gradient w.r.t. input data
+ modulated_deformable_col2im_cuda(
+ columns, offset[b], mask[b], 1, channels, height, width, height_out,
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, deformable_group, grad_input[b]);
+
+ // gradient w.r.t. weight, dWeight should accumulate across the batch and
+ // group
+ modulated_deformable_im2col_cuda(
+ input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+ width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, deformable_group, columns);
+
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+ grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+ grad_weight.size(1), grad_weight.size(2),
+ grad_weight.size(3)});
+ if (with_bias)
+ grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+ for (int g = 0; g < group; g++) {
+ grad_weight[g] =
+ grad_weight[g]
+ .flatten(1)
+ .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+ .view_as(grad_weight[g]);
+ if (with_bias) {
+ grad_bias[g] =
+ grad_bias[g]
+ .view({-1, 1})
+ .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+ .view(-1);
+ }
+ }
+
+ columns =
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+ grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+ grad_weight.size(2), grad_weight.size(3),
+ grad_weight.size(4)});
+ if (with_bias)
+ grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+ }
+ grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+ grad_output.size(2), grad_output.size(3),
+ grad_output.size(4)});
+}
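+
+// A minimal sketch of how these entry points are typically exposed to Python
+// via a torch extension; the module setup and docstrings below are
+// illustrative only and not part of this file:
+//
+//   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+//     m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda,
+//           "deformable convolution forward (CUDA)");
+//     m.def("modulated_deform_conv_cuda_forward",
+//           &modulated_deform_conv_cuda_forward,
+//           "modulated deformable convolution forward (CUDA)");
+//   }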
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee15810103a4edaf213abdb222a70249d622c0f9
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu
@@ -0,0 +1,874 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+
+using namespace at;
+
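+// Grid-stride loop: each thread starts at its global index and advances by
+// the total number of launched threads, so all n elements are covered even
+// when GET_BLOCKS clamps the grid to kMaxGridNum blocks.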
+#define CUDA_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024;
+const int kMaxGridNum = 65535;
+inline int GET_BLOCKS(const int N)
+{
+ return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+}
+
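+// Bilinear interpolation of bottom_data at a fractional location (h, w).
+// Neighbours that fall outside the image contribute zero, which implements
+// implicit zero padding at the borders.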
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
+ const int height, const int width, scalar_t h, scalar_t w)
+{
+
+ int h_low = floor(h);
+ int w_low = floor(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ scalar_t lh = h - h_low;
+ scalar_t lw = w - w_low;
+ scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0)
+ v1 = bottom_data[h_low * data_width + w_low];
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = bottom_data[h_low * data_width + w_high];
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = bottom_data[h_high * data_width + w_low];
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = bottom_data[h_high * data_width + w_high];
+
+ scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
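+// Backward counterpart of the bilinear sampler: returns the interpolation
+// coefficient linking the fractional location (argmax_h, argmax_w) to the
+// integer pixel (h, w), or 0 if that pixel is not one of its four neighbours.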
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
+ const int h, const int w, const int height, const int width)
+{
+
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+ {
+ //empty
+ return 0;
+ }
+
+ int argmax_h_low = floor(argmax_h);
+ int argmax_w_low = floor(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ scalar_t weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
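+// Derivative of the bilinearly sampled value w.r.t. the sampling coordinate
+// itself: bp_dir == 0 differentiates w.r.t. h, bp_dir == 1 w.r.t. w. This is
+// what backpropagates into the learned offsets.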
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
+ const int height, const int width, const scalar_t *im_data,
+ const int data_width, const int bp_dir)
+{
+
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+ {
+ //empty
+ return 0;
+ }
+
+ int argmax_h_low = floor(argmax_h);
+ int argmax_w_low = floor(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ scalar_t weight = 0;
+
+ if (bp_dir == 0)
+ {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+ else if (bp_dir == 1)
+ {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
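+// im2col with per-position offsets: one thread per column element, with
+// index decomposing into (c_im, b_col, h_col, w_col). Each thread writes the
+// kernel_h * kernel_w bilinear samples of its input channel, so data_col is
+// laid out as (c_im * kH * kW, batch, height_col, width_col).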
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+ const int batch_size, const int num_channels, const int deformable_group,
+ const int height_col, const int width_col,
+ scalar_t *data_col)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ // index index of output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+ scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
+ const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+ const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i)
+ {
+ for (int j = 0; j < kernel_w; ++j)
+ {
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ scalar_t val = static_cast<scalar_t>(0);
+ const scalar_t h_im = h_in + i * dilation_h + offset_h;
+ const scalar_t w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ {
+ //const scalar_t map_h = i * dilation_h + offset_h;
+ //const scalar_t map_w = j * dilation_w + offset_w;
+ //const int cur_height = height - h_in;
+ //const int cur_width = width - w_in;
+ //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+ val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
+ }
+ *data_col_ptr = val;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
+
+void deformable_im2col(
+ const at::Tensor data_im, const at::Tensor data_offset, const int channels,
+ const int height, const int width, const int ksize_h, const int ksize_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
+ const int deformable_group, at::Tensor data_col)
+{
+ // num_axes should be smaller than block size
+ // todo: check parallel_imgs is correctly passed in
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+ int num_kernels = channels * height_col * width_col * parallel_imgs;
+ int channel_per_deformable_group = channels / deformable_group;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+
+ deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+ channel_per_deformable_group, parallel_imgs, channels, deformable_group,
+ height_col, width_col, data_col_);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
+ }
+}
+
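+// Inverse of deformable_im2col: every column entry is scattered back onto
+// the input pixels that produced it. A 5x5 (dy, dx) window around the
+// truncated sampling point is scanned, but the < 1 distance tests keep only
+// the four bilinear neighbours; atomicAdd resolves concurrent writes.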
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+ const int n, const scalar_t *data_col, const scalar_t *data_offset,
+ const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group,
+ const int batch_size, const int deformable_group,
+ const int height_col, const int width_col,
+ scalar_t *grad_im)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
+ 2 * kernel_h * kernel_w * height_col * width_col;
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const scalar_t cur_top_grad = data_col[index];
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++)
+ {
+ for (int dx = -2; dx <= 2; dx++)
+ {
+ if (cur_h + dy >= 0 && cur_h + dy < height &&
+ cur_w + dx >= 0 && cur_w + dx < width &&
+ abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1)
+ {
+ int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
+
+void deformable_col2im(
+ const at::Tensor data_col, const at::Tensor data_offset, const int channels,
+ const int height, const int width, const int ksize_h,
+ const int ksize_w, const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group,
+ at::Tensor grad_im)
+{
+
+ // todo: make sure parallel_imgs is passed in correctly
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+ int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
+ int channel_per_deformable_group = channels / deformable_group;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
+
+ deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
+ ksize_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, channel_per_deformable_group,
+ parallel_imgs, deformable_group, height_col, width_col, grad_im_);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
+ }
+}
+
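+// Gradient w.r.t. the offset field: one thread per offset value. Each offset
+// channel influences every input channel of its deformable group, so the
+// thread walks the corresponding column entries (stride col_step) and
+// accumulates coordinate-weighted contributions from data_col.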
+template <typename scalar_t>
+__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
+ const scalar_t *data_im, const scalar_t *data_offset,
+ const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group,
+ const int batch_size, const int offset_channels, const int deformable_group,
+ const int height_col, const int width_col, scalar_t *grad_offset)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ scalar_t val = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
+ batch_size * width_col * height_col;
+ const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
+ channel_per_deformable_group / kernel_h / kernel_w * height * width;
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
+ {
+ const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ scalar_t inv_h = h_in + i * dilation_h + offset_h;
+ scalar_t inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ {
+ inv_h = inv_w = -2;
+ }
+ const scalar_t weight = get_coordinate_weight(
+ inv_h, inv_w,
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+ val += weight * data_col_ptr[col_pos];
+ cnt += 1;
+ }
+
+ grad_offset[index] = val;
+ }
+}
+
+void deformable_col2im_coord(
+ const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
+ const int channels, const int height, const int width, const int ksize_h,
+ const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
+ const int stride_w, const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
+{
+
+ int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+ int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+ int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
+ int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
+
+ deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
+ ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, channel_per_deformable_group,
+ parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
+ height_col, width_col, grad_offset_);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in deformable_col2im_coord: %s\n", cudaGetErrorString(err));
+ }
+}
+
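+// The dmcn_* device helpers below are the modulated (DCNv2) counterparts of
+// the three helpers above; the arithmetic is identical, only the callers
+// differ.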
+template <typename scalar_t>
+__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
+ const int height, const int width, scalar_t h, scalar_t w)
+{
+ int h_low = floor(h);
+ int w_low = floor(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ scalar_t lh = h - h_low;
+ scalar_t lw = w - w_low;
+ scalar_t hh = 1 - lh, hw = 1 - lw;
+
+ scalar_t v1 = 0;
+ if (h_low >= 0 && w_low >= 0)
+ v1 = bottom_data[h_low * data_width + w_low];
+ scalar_t v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = bottom_data[h_low * data_width + w_high];
+ scalar_t v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = bottom_data[h_high * data_width + w_low];
+ scalar_t v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = bottom_data[h_high * data_width + w_high];
+
+ scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
+ const int h, const int w, const int height, const int width)
+{
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+ {
+ //empty
+ return 0;
+ }
+
+ int argmax_h_low = floor(argmax_h);
+ int argmax_w_low = floor(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ scalar_t weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
+ const int height, const int width, const scalar_t *im_data,
+ const int data_width, const int bp_dir)
+{
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+ {
+ //empty
+ return 0;
+ }
+
+ int argmax_h_low = floor(argmax_h);
+ int argmax_w_low = floor(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ scalar_t weight = 0;
+
+ if (bp_dir == 0)
+ {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+ else if (bp_dir == 1)
+ {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
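+// Same thread decomposition as deformable_im2col_gpu_kernel, with one
+// addition: each bilinear sample is multiplied by its learned modulation
+// mask before being written to the column matrix.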
+template <typename scalar_t>
+__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
+ const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group,
+ const int batch_size, const int num_channels, const int deformable_group,
+ const int height_col, const int width_col,
+ scalar_t *data_col)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ // index: index of the output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+
+ scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
+ const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+ const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+
+ const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i)
+ {
+ for (int j = 0; j < kernel_w; ++j)
+ {
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+ scalar_t val = static_cast<scalar_t>(0);
+ const scalar_t h_im = h_in + i * dilation_h + offset_h;
+ const scalar_t w_im = w_in + j * dilation_w + offset_w;
+ //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ {
+ //const float map_h = i * dilation_h + offset_h;
+ //const float map_w = j * dilation_w + offset_w;
+ //const int cur_height = height - h_in;
+ //const int cur_width = width - w_in;
+ //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+ val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
+ }
+ *data_col_ptr = val * mask;
+ data_col_ptr += batch_size * height_col * width_col;
+ //data_col_ptr += height_col * width_col;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
+ const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
+ const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group,
+ const int batch_size, const int deformable_group,
+ const int height_col, const int width_col,
+ scalar_t *grad_im)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ const int j = (index / width_col / height_col / batch_size) % kernel_w;
+ const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / channel_per_deformable_group;
+
+ int w_out = index % width_col;
+ int h_out = (index / width_col) % height_col;
+ int b = (index / width_col / height_col) % batch_size;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+ const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+ const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+ const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+ const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+ const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+ const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+ const scalar_t cur_top_grad = data_col[index] * mask;
+ const int cur_h = (int)cur_inv_h_data;
+ const int cur_w = (int)cur_inv_w_data;
+ for (int dy = -2; dy <= 2; dy++)
+ {
+ for (int dx = -2; dx <= 2; dx++)
+ {
+ if (cur_h + dy >= 0 && cur_h + dy < height &&
+ cur_w + dx >= 0 && cur_w + dx < width &&
+ abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+ abs(cur_inv_w_data - (cur_w + dx)) < 1)
+ {
+ int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+ scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
+ atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+ }
+ }
+ }
+ }
+}
+
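+// Joint gradient w.r.t. offsets and mask. val accumulates the offset
+// gradient (coordinate weight x column gradient x mask), while mval
+// accumulates the mask gradient (bilinear sample x column gradient); the
+// mask gradient is written once per sampling location, on the even offset
+// channel.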
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
+ const scalar_t *data_col, const scalar_t *data_im,
+ const scalar_t *data_offset, const scalar_t *data_mask,
+ const int channels, const int height, const int width,
+ const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group,
+ const int batch_size, const int offset_channels, const int deformable_group,
+ const int height_col, const int width_col,
+ scalar_t *grad_offset, scalar_t *grad_mask)
+{
+ CUDA_KERNEL_LOOP(index, n)
+ {
+ scalar_t val = 0, mval = 0;
+ int w = index % width_col;
+ int h = (index / width_col) % height_col;
+ int c = (index / width_col / height_col) % offset_channels;
+ int b = (index / width_col / height_col) / offset_channels;
+ // compute the start and end of the output
+
+ const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+ const int col_step = kernel_h * kernel_w;
+ int cnt = 0;
+ const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
+ const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
+ const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+ const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+
+ const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+ for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
+ {
+ const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+ const int bp_dir = offset_c % 2;
+
+ int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+ int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+ int w_out = col_pos % width_col;
+ int h_out = (col_pos / width_col) % height_col;
+ int w_in = w_out * stride_w - pad_w;
+ int h_in = h_out * stride_h - pad_h;
+ const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+ const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+ const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+ const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+ const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+ const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
+ scalar_t inv_h = h_in + i * dilation_h + offset_h;
+ scalar_t inv_w = w_in + j * dilation_w + offset_w;
+ if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+ {
+ inv_h = inv_w = -2;
+ }
+ else
+ {
+ mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
+ }
+ const scalar_t weight = dmcn_get_coordinate_weight(
+ inv_h, inv_w,
+ height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+ val += weight * data_col_ptr[col_pos] * mask;
+ cnt += 1;
+ }
+ // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
+ grad_offset[index] = val;
+ if (offset_c % 2 == 0)
+ // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
+ grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
+ }
+}
+
+void modulated_deformable_im2col_cuda(
+ const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
+ const int batch_size, const int channels, const int height_im, const int width_im,
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int deformable_group, at::Tensor data_col)
+{
+ // num_axes should be smaller than block size
+ const int channel_per_deformable_group = channels / deformable_group;
+ const int num_kernels = channels * batch_size * height_col * width_col;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+ scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+
+ modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w,
+ pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
+ batch_size, channels, deformable_group, height_col, width_col, data_col_);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+ }
+}
+
+void modulated_deformable_col2im_cuda(
+ const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
+ const int batch_size, const int channels, const int height_im, const int width_im,
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int deformable_group, at::Tensor grad_im)
+{
+
+ const int channel_per_deformable_group = channels / deformable_group;
+ const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+ scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
+
+ modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, channel_per_deformable_group,
+ batch_size, deformable_group, height_col, width_col, grad_im_);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+ }
+}
+
+void modulated_deformable_col2im_coord_cuda(
+ const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
+ const int batch_size, const int channels, const int height_im, const int width_im,
+ const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int deformable_group,
+ at::Tensor grad_offset, at::Tensor grad_mask)
+{
+ const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
+ const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
+ const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+ const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+ const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+ const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+ scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
+ scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
+
+ modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+ num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
+ kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+ dilation_h, dilation_w, channel_per_deformable_group,
+ batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
+ grad_offset_, grad_mask_);
+ }));
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
+ }
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bbe22d77b49be70f174ae3f17647b09968358255
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu
@@ -0,0 +1,87 @@
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
+
+// based on
+// author: Charles Shang
+// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <vector>
+#include <iostream>
+#include <cmath>
+
+
+void DeformablePSROIPoolForward(
+ const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
+ at::Tensor out, at::Tensor top_count, const int batch, const int channels,
+ const int height, const int width, const int num_bbox,
+ const int channels_trans, const int no_trans, const float spatial_scale,
+ const int output_dim, const int group_size, const int pooled_size,
+ const int part_size, const int sample_per_part, const float trans_std);
+
+void DeformablePSROIPoolBackwardAcc(
+ const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
+ const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
+ at::Tensor trans_grad, const int batch, const int channels,
+ const int height, const int width, const int num_bbox,
+ const int channels_trans, const int no_trans, const float spatial_scale,
+ const int output_dim, const int group_size, const int pooled_size,
+ const int part_size, const int sample_per_part, const float trans_std);
+
+void deform_psroi_pooling_cuda_forward(
+ at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
+ at::Tensor top_count, const int no_trans, const float spatial_scale,
+ const int output_dim, const int group_size, const int pooled_size,
+ const int part_size, const int sample_per_part, const float trans_std)
+{
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+ const int batch = input.size(0);
+ const int channels = input.size(1);
+ const int height = input.size(2);
+ const int width = input.size(3);
+ const int channels_trans = no_trans ? 2 : trans.size(1);
+
+ const int num_bbox = bbox.size(0);
+ if (num_bbox != out.size(0))
+ AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
+ out.size(0), num_bbox);
+
+ DeformablePSROIPoolForward(
+ input, bbox, trans, out, top_count, batch, channels, height, width,
+ num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
+ pooled_size, part_size, sample_per_part, trans_std);
+}
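+
+// Tensor layouts here are inferred from the size() calls above rather than
+// from a documented contract: input is (batch, channels, height, width), each
+// bbox row is (batch_idx, x1, y1, x2, y2), trans is
+// (num_bbox, 2 * num_classes, part_size, part_size), and out is
+// (num_bbox, output_dim, pooled_size, pooled_size).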
+
+void deform_psroi_pooling_cuda_backward(
+ at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
+ at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
+ const int no_trans, const float spatial_scale, const int output_dim,
+ const int group_size, const int pooled_size, const int part_size,
+ const int sample_per_part, const float trans_std)
+{
+ TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
+ TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+ const int batch = input.size(0);
+ const int channels = input.size(1);
+ const int height = input.size(2);
+ const int width = input.size(3);
+ const int channels_trans = no_trans ? 2 : trans.size(1);
+
+ const int num_bbox = bbox.size(0);
+ if (num_bbox != out_grad.size(0))
+ AT_ERROR("Output shape and bbox number wont match: (%d vs %d).",
+ out_grad.size(0), num_bbox);
+
+ DeformablePSROIPoolBackwardAcc(
+ out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
+ channels, height, width, num_bbox, channels_trans, no_trans,
+ spatial_scale, output_dim, group_size, pooled_size, part_size,
+ sample_per_part, trans_std);
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3f6c4cb22f6ecbae242e21c9530f474e709c6e90
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu
@@ -0,0 +1,365 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file deformable_psroi_pooling.cu
+ * \brief
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
+*/
+/***************** Adapted by Charles Shang *********************/
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
+
+
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+
+using namespace at;
+
+#define CUDA_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+ i < (n); \
+ i += blockDim.x * gridDim.x)
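+// The grid-stride idiom above lets a fixed-size launch cover any n: each
+// thread starts at its global index and advances by the total thread count
+// of the grid until the work is exhausted.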
+
+const int CUDA_NUM_THREADS = 1024;
+inline int GET_BLOCKS(const int N)
+{
+ return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+}
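+// e.g. N = 2500 work items -> GET_BLOCKS(2500) = (2500 + 1023) / 1024 = 3
+// blocks of 1024 threads; the i < n check in CUDA_KERNEL_LOOP idles the
+// surplus threads of the last block.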
+
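+// Standard bilinear interpolation over the four integer neighbors of (x, y):
+//   f(x, y) = (1-dx)(1-dy) f(x1,y1) + (1-dx)dy f(x1,y2)
+//           + dx(1-dy) f(x2,y1) + dx dy f(x2,y2)
+// with dx = x - floor(x) and dy = y - floor(y).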
+template <typename scalar_t>
+__device__ scalar_t bilinear_interp(
+ const scalar_t *data,
+ const scalar_t x,
+ const scalar_t y,
+ const int width,
+ const int height)
+{
+ int x1 = floor(x);
+ int x2 = ceil(x);
+ int y1 = floor(y);
+ int y2 = ceil(y);
+ scalar_t dist_x = (scalar_t)(x - x1);
+ scalar_t dist_y = (scalar_t)(y - y1);
+ scalar_t value11 = data[y1 * width + x1];
+ scalar_t value12 = data[y2 * width + x1];
+ scalar_t value21 = data[y1 * width + x2];
+ scalar_t value22 = data[y2 * width + x2];
+ scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
+ return value;
+}
+
+template <typename scalar_t>
+__global__ void DeformablePSROIPoolForwardKernel(
+ const int count,
+ const scalar_t *bottom_data,
+ const scalar_t spatial_scale,
+ const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const scalar_t *bottom_rois, const scalar_t *bottom_trans,
+ const int no_trans,
+ const scalar_t trans_std,
+ const int sample_per_part,
+ const int output_dim,
+ const int group_size,
+ const int part_size,
+ const int num_classes,
+ const int channels_each_class,
+ scalar_t *top_data,
+ scalar_t *top_count)
+{
+ CUDA_KERNEL_LOOP(index, count)
+ {
+ // The output is in order (n, ctop, ph, pw)
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
+ int n = index / pooled_width / pooled_height / output_dim;
+
+ // [start, end) interval for spatial sampling
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+ // Force too small ROIs to be 1x1
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+ // Compute w and h at bottom
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+ int class_id = ctop / channels_each_class;
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
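+ // Shift the bin's sampling window by the learned offset, normalized by the
+ // ROI extent; this displacement is what makes the pooling deformable.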
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+ wstart += trans_x * roi_width;
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+ hstart += trans_y * roi_height;
+
+ scalar_t sum = 0;
+ int count = 0;
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+ gw = min(max(gw, 0), group_size - 1);
+ gh = min(max(gh, 0), group_size - 1);
+
+ const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
+ for (int ih = 0; ih < sample_per_part; ih++)
+ {
+ for (int iw = 0; iw < sample_per_part; iw++)
+ {
+ scalar_t w = wstart + iw * sub_bin_size_w;
+ scalar_t h = hstart + ih * sub_bin_size_h;
+ // bilinear interpolation
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+ {
+ continue;
+ }
+ w = min(max(w, 0.), width - 1.);
+ h = min(max(h, 0.), height - 1.);
+ int c = (ctop * group_size + gh) * group_size + gw;
+ scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
+ sum += val;
+ count++;
+ }
+ }
+ top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
+ top_count[index] = count;
+ }
+}
+
+template <typename scalar_t>
+__global__ void DeformablePSROIPoolBackwardAccKernel(
+ const int count,
+ const scalar_t *top_diff,
+ const scalar_t *top_count,
+ const int num_rois,
+ const scalar_t spatial_scale,
+ const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int output_dim,
+ scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
+ const scalar_t *bottom_data,
+ const scalar_t *bottom_rois,
+ const scalar_t *bottom_trans,
+ const int no_trans,
+ const scalar_t trans_std,
+ const int sample_per_part,
+ const int group_size,
+ const int part_size,
+ const int num_classes,
+ const int channels_each_class)
+{
+ CUDA_KERNEL_LOOP(index, count)
+ {
+ // The output is in order (n, ctop, ph, pw)
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
+ int n = index / pooled_width / pooled_height / output_dim;
+
+ // [start, end) interval for spatial sampling
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+ // Force too small ROIs to be 1x1
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+ // Compute w and h at bottom
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+ int class_id = ctop / channels_each_class;
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+ wstart += trans_x * roi_width;
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+ hstart += trans_y * roi_height;
+
+ if (top_count[index] <= 0)
+ {
+ continue;
+ }
+ scalar_t diff_val = top_diff[index] / top_count[index];
+ const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
+ scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+ gw = min(max(gw, 0), group_size - 1);
+ gh = min(max(gh, 0), group_size - 1);
+
+ for (int ih = 0; ih < sample_per_part; ih++)
+ {
+ for (int iw = 0; iw < sample_per_part; iw++)
+ {
+ scalar_t w = wstart + iw * sub_bin_size_w;
+ scalar_t h = hstart + ih * sub_bin_size_h;
+ // bilinear interpolation
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+ {
+ continue;
+ }
+ w = min(max(w, 0.), width - 1.);
+ h = min(max(h, 0.), height - 1.);
+ int c = (ctop * group_size + gh) * group_size + gw;
+ // backward on feature
+ int x0 = floor(w);
+ int x1 = ceil(w);
+ int y0 = floor(h);
+ int y1 = ceil(h);
+ scalar_t dist_x = w - x0, dist_y = h - y0;
+ scalar_t q00 = (1 - dist_x) * (1 - dist_y);
+ scalar_t q01 = (1 - dist_x) * dist_y;
+ scalar_t q10 = dist_x * (1 - dist_y);
+ scalar_t q11 = dist_x * dist_y;
+ int bottom_index_base = c * height * width;
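+ // Scatter the averaged upstream gradient to the four neighbors with their
+ // bilinear weights; atomics are required because bins of overlapping ROIs
+ // can write to the same input pixel concurrently.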
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
+
+ if (no_trans)
+ {
+ continue;
+ }
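+ // Gradient w.r.t. the learned offsets: differentiate the bilinear sample
+ // with respect to x and y, then apply the chain rule through the trans_std
+ // scaling and the roi_width / roi_height normalization of the forward pass.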
+ scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+ scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+ scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+ scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+ scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
+ diff_x *= roi_width;
+ scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
+ diff_y *= roi_height;
+
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
+ }
+ }
+ }
+}
+
+void DeformablePSROIPoolForward(const at::Tensor data,
+ const at::Tensor bbox,
+ const at::Tensor trans,
+ at::Tensor out,
+ at::Tensor top_count,
+ const int batch,
+ const int channels,
+ const int height,
+ const int width,
+ const int num_bbox,
+ const int channels_trans,
+ const int no_trans,
+ const float spatial_scale,
+ const int output_dim,
+ const int group_size,
+ const int pooled_size,
+ const int part_size,
+ const int sample_per_part,
+ const float trans_std)
+{
+ const int pooled_height = pooled_size;
+ const int pooled_width = pooled_size;
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
+ const scalar_t *bottom_data = data.data_ptr<scalar_t>();
+ const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
+ scalar_t *top_data = out.data_ptr<scalar_t>();
+ scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
+
+ DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
+ count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
+ bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
+ group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
+ }
+}
+
+void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
+ const at::Tensor data,
+ const at::Tensor bbox,
+ const at::Tensor trans,
+ const at::Tensor top_count,
+ at::Tensor in_grad,
+ at::Tensor trans_grad,
+ const int batch,
+ const int channels,
+ const int height,
+ const int width,
+ const int num_bbox,
+ const int channels_trans,
+ const int no_trans,
+ const float spatial_scale,
+ const int output_dim,
+ const int group_size,
+ const int pooled_size,
+ const int part_size,
+ const int sample_per_part,
+ const float trans_std)
+{
+ // LOG(INFO) << "DeformablePSROIPoolBackward";
+ const int num_rois = num_bbox;
+ const int pooled_height = pooled_size;
+ const int pooled_width = pooled_size;
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
+ const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
+ const scalar_t *bottom_data = data.data_ptr<scalar_t>();
+ const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
+ scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
+ scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
+ const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
+
+ DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
+ count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
+ pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
+ bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
+ group_size, part_size, num_classes, channels_each_class);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
+ }
+}
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cuda/ml_nms.cu b/maskrcnn_benchmark/csrc/cuda/ml_nms.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cd958a0899a9e3adc69ca053170beb2b34fbd8ef
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ml_nms.cu
@@ -0,0 +1,136 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <vector>
+#include <iostream>
+
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
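+
+// Boxes are rows of (x1, y1, x2, y2, score, label). devIoU returns 0 when the
+// labels differ, so this multi-label NMS never suppresses across classes; the
+// +1 terms treat coordinates as inclusive pixel indices.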
+
+__device__ inline float devIoU(float const * const a, float const * const b) {
+ if (a[5] != b[5]) {
+ return 0.0;
+ }
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+ return interS / (Sa + Sb - interS);
+}
+
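+// Each pair of 64-box tiles produces one 64-bit word: bit i of
+// dev_mask[box * col_blocks + col_start] is set when `box` overlaps box
+// (col_start * 64 + i) beyond the threshold (threadsPerBlock is 64 by
+// construction). On the diagonal tile, start = threadIdx.x + 1 skips
+// self-comparisons and duplicate pairs.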
+__global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+ const float *dev_boxes, unsigned long long *dev_mask) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ __shared__ float block_boxes[threadsPerBlock * 6];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 6 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
+ block_boxes[threadIdx.x * 6 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
+ block_boxes[threadIdx.x * 6 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
+ block_boxes[threadIdx.x * 6 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
+ block_boxes[threadIdx.x * 6 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
+ block_boxes[threadIdx.x * 6 + 5] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const float *cur_box = dev_boxes + cur_box_idx * 6;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+}
+
+// boxes is a N x 6 tensor
+at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+ using scalar_t = float;
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
+ auto scores = boxes.select(1, 4);
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+ auto boxes_sorted = boxes.index_select(0, order_t);
+
+ int boxes_num = boxes.size(0);
+
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
+
+ scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
+
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+
+ unsigned long long* mask_dev = NULL;
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
+
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
+
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
+ THCCeilDiv(boxes_num, threadsPerBlock));
+ dim3 threads(threadsPerBlock);
+ ml_nms_kernel<<<blocks, threads>>>(boxes_num,
+ nms_overlap_thresh,
+ boxes_dev,
+ mask_dev);
+
+ std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
+ THCudaCheck(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(unsigned long long) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
+
+ std::vector<unsigned long long> remv(col_blocks);
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
+ int64_t* keep_out = keep.data_ptr<int64_t>();
+
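+ // Greedy suppression on the host: visit boxes in descending score order,
+ // keep a box only if no previously kept box has flagged it, then OR its
+ // mask row into remv so everything it suppresses is skipped later.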
+ int num_to_keep = 0;
+ for (int i = 0; i < boxes_num; i++) {
+ int nblock = i / threadsPerBlock;
+ int inblock = i % threadsPerBlock;
+
+ if (!(remv[nblock] & (1ULL << inblock))) {
+ keep_out[num_to_keep++] = i;
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
+ for (int j = nblock; j < col_blocks; j++) {
+ remv[j] |= p[j];
+ }
+ }
+ }
+
+ THCudaFree(state, mask_dev);
+ // TODO improve this part
+ return std::get<0>(order_t.index({
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
+ order_t.device(), keep.scalar_type())
+ }).sort(0, false));
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6221b85fa8f6b40cf498b76d6dbfc3c8438e25e
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/nms.cu
@@ -0,0 +1,131 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <vector>
+#include <iostream>
+
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+__device__ inline float devIoU(float const * const a, float const * const b) {
+ float left = max(a[0], b[0]), right = min(a[2], b[2]);
+ float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+ float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+ float interS = width * height;
+ float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+ float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+ return interS / (Sa + Sb - interS);
+}
+
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+ const float *dev_boxes, unsigned long long *dev_mask) {
+ const int row_start = blockIdx.y;
+ const int col_start = blockIdx.x;
+
+ // if (row_start > col_start) return;
+
+ const int row_size =
+ min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+ const int col_size =
+ min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+ __shared__ float block_boxes[threadsPerBlock * 5];
+ if (threadIdx.x < col_size) {
+ block_boxes[threadIdx.x * 5 + 0] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+ block_boxes[threadIdx.x * 5 + 1] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+ block_boxes[threadIdx.x * 5 + 2] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+ block_boxes[threadIdx.x * 5 + 3] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+ block_boxes[threadIdx.x * 5 + 4] =
+ dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+ }
+ __syncthreads();
+
+ if (threadIdx.x < row_size) {
+ const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+ const float *cur_box = dev_boxes + cur_box_idx * 5;
+ int i = 0;
+ unsigned long long t = 0;
+ int start = 0;
+ if (row_start == col_start) {
+ start = threadIdx.x + 1;
+ }
+ for (i = start; i < col_size; i++) {
+ if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+ t |= 1ULL << i;
+ }
+ }
+ const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
+ dev_mask[cur_box_idx * col_blocks + col_start] = t;
+ }
+}
+
+// boxes is a N x 5 tensor
+at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+ using scalar_t = float;
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
+ auto scores = boxes.select(1, 4);
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+ auto boxes_sorted = boxes.index_select(0, order_t);
+
+ int boxes_num = boxes.size(0);
+
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
+
+ scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
+
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+
+ unsigned long long* mask_dev = NULL;
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
+
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
+
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
+ THCCeilDiv(boxes_num, threadsPerBlock));
+ dim3 threads(threadsPerBlock);
+ nms_kernel<<<blocks, threads>>>(boxes_num,
+ nms_overlap_thresh,
+ boxes_dev,
+ mask_dev);
+
+ std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
+ THCudaCheck(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(unsigned long long) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
+
+ std::vector<unsigned long long> remv(col_blocks);