diff --git a/MODEL/desco_glip_tiny.pth b/MODEL/desco_glip_tiny.pth
new file mode 100644
index 0000000000000000000000000000000000000000..6b3d6733cf476ba61f835f1e2520ea7c51696424
--- /dev/null
+++ b/MODEL/desco_glip_tiny.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:199479f67b5fbd4ab5e232c8fa8df3e9ab42a96966a023524c6cd95710ea5192
+size 3707483035
diff --git a/app.py b/app.py
index 446e183143e52600d900e53aebb40656e7c8fed8..4467086da18c371685be675136168b7a1410813d 100644
--- a/app.py
+++ b/app.py
@@ -19,9 +19,12 @@ from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
# Use this command for evaluate the GLIP-T model
-config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
+#config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
#weight_file = "MODEL/glip_tiny_model_o365_goldg_cc_sbu.pth"
+config_file = "configs/pretrain_new/desco_glip.yaml"
+weight_file = "MODEL/desco_glip_tiny.pth"
+
# Use this command if you want to try the GLIP-L model
# ! wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth -O MODEL/glip_large_model.pth
# config_file = "configs/pretrain/glip_Swin_L.yaml"
@@ -61,12 +64,12 @@ gr.Interface(
),
],
examples=[
- ["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
- ["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
+ #["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
+ #["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
["./coco_000000281759.jpg", "A green umbrella. A pink striped umbrella. A plain white umbrella."],
["./coco_000000281759.jpg", "a flowery top. A blue dress. An orange shirt ."],
["./coco_000000281759.jpg", "a car . An electricity box ."],
- ["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
+ #["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
],
article=Path("docs/intro.md").read_text()
).launch()
diff --git a/coco_000000281759.jpg b/coco_000000281759.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a4c8ce4d4a3c89eaa0b667cdb1af7cf066442a28
--- /dev/null
+++ b/coco_000000281759.jpg
@@ -0,0 +1,275 @@
+
+
+
This is the HuggingFace Gradio Demo for GLIP. The model requires an image, and a text to be the inputs. The text input can either be a natural sentence description (grounding), or a simple concatenation of some random categories (object detection).
+
The paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data to improve both tasks and bootstrap a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich.
News: We are also holding an ODinW challenge at the CV in the Wild Workshop @ ECCV 2022. We hope our open-source code encourages the community to participate in this challenge!
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/maskrcnn_benchmark/__init__.py b/maskrcnn_benchmark/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bc96c7a6bf8379e1adfb3e4adf536107b385fa9
--- /dev/null
+++ b/maskrcnn_benchmark/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
diff --git a/maskrcnn_benchmark/config/__init__.py b/maskrcnn_benchmark/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..81363c8df5f903a254414861a854116899fc8bbe
--- /dev/null
+++ b/maskrcnn_benchmark/config/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+from .defaults import _C as cfg
+from .paths_catalog import try_to_find
diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fab60edc7dac56db4861666a951d7155c386be0
--- /dev/null
+++ b/maskrcnn_benchmark/config/defaults.py
@@ -0,0 +1,982 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+import os
+
+from yacs.config import CfgNode as CN
+
+# -----------------------------------------------------------------------------
+# Convention about Training / Test specific parameters
+# -----------------------------------------------------------------------------
+# Whenever an argument can be either used for training or for testing, the
+# corresponding name will be post-fixed by a _TRAIN for a training parameter,
+# or _TEST for a test-specific parameter.
+# For example, the number of images during training will be
+# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
+# IMAGES_PER_BATCH_TEST
+
+# -----------------------------------------------------------------------------
+# Config definition
+# -----------------------------------------------------------------------------
+
+_C = CN()
+
+_C.MODEL = CN()
+_C.MODEL.RPN_ONLY = False
+_C.MODEL.BOX_ON = True
+_C.MODEL.MASK_ON = False
+_C.MODEL.KEYPOINT_ON = False
+_C.MODEL.DEVICE = "cuda"
+
+_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
+
+_C.MODEL.RPN_ARCHITECTURE = "RPN"
+_C.MODEL.DEBUG = False # add debug flag
+_C.MODEL.ONNX = False # add onnx flag
+
+# If the WEIGHT starts with a catalog://, like :R-50, the code will look for
+# the path in paths_catalog. Else, it will use it as the specified absolute
+# path
+_C.MODEL.WEIGHT = ""
+_C.MODEL.PRETRAIN_NAME = ""
+
+# If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
+_C.MODEL.LINEAR_PROB = False
+
+# -----------------------------------------------------------------------------
+# Multitask Training / Test specific parameters
+# -----------------------------------------------------------------------------
+_C.MODEL.MULTITASK = CN(new_allowed=True)
+
+# -----------------------------------------------------------------------------
+# INPUT
+# -----------------------------------------------------------------------------
+_C.INPUT = CN()
+# Size of the smallest side of the image during training
+_C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
+# Maximum size of the side of the image during training
+_C.INPUT.MAX_SIZE_TRAIN = 1333
+# Size of the smallest side of the image during testing
+_C.INPUT.MIN_SIZE_TEST = 800
+# Maximum size of the side of the image during testing
+_C.INPUT.MAX_SIZE_TEST = 1333
+# Values to be used for image normalization
+_C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
+# Values to be used for image normalization
+_C.INPUT.PIXEL_STD = [1.0, 1.0, 1.0]
+# Convert image to BGR format (for Caffe2 models), in range 0-255
+_C.INPUT.TO_BGR255 = True
+_C.INPUT.FORMAT = ""
+_C.INPUT.FIX_RES = False
+
+# -----------------------------------------------------------------------------
+# Augmentation
+# -----------------------------------------------------------------------------
+_C.AUGMENT = CN()
+_C.AUGMENT.USE_RA = 0
+_C.AUGMENT.FLIP_PROB_TRAIN = 0.5
+_C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
+_C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
+
+_C.AUGMENT.BRIGHTNESS = 0.0
+_C.AUGMENT.CONTRAST = 0.0
+_C.AUGMENT.SATURATION = 0.0
+_C.AUGMENT.HUE = 0.0
+
+_C.AUGMENT.CROP_PROB = 0.5
+_C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
+_C.AUGMENT.CROP_MIN_SIZE = 0.3
+
+_C.AUGMENT.AFFINE_PROB = 0.5
+_C.AUGMENT.AFFINE_R = (-10, 10)
+_C.AUGMENT.AFFINE_T = (0.1, 0.1)
+_C.AUGMENT.AFFINE_S = (0.9, 1.1)
+_C.AUGMENT.AFFINE_SHEAR = (-2, 2)
+_C.AUGMENT.AFFINE_FILL = (127.5, 127.5, 127.5)
+
+_C.AUGMENT.ERASE_PROB = 0.0
+_C.AUGMENT.ERASE_L = 0.02
+_C.AUGMENT.ERASE_H = 1 / 3
+_C.AUGMENT.ERASE_MIN_ASPECT = 0.3
+_C.AUGMENT.ERASE_MODE = "const"
+_C.AUGMENT.ERASE_MAX_COUNT = 1
+_C.AUGMENT.ERASE_MAX_OVERLAP = 0.6
+_C.AUGMENT.ERASE_MAX_VALUE = 255
+
+_C.AUGMENT.MOSAIC_PROB = 0.0
+_C.AUGMENT.MOSAIC_SHIFT = 0.5
+_C.AUGMENT.MOSAIC_SIZE = -1
+
+_C.AUGMENT.PASTE_PROB = 0.0
+_C.AUGMENT.PASTE_CAT = ()
+_C.AUGMENT.PASTE_NUM = 2
+# -----------------------------------------------------------------------------
+# Dataset
+# -----------------------------------------------------------------------------
+_C.DATASETS = CN()
+# List of the dataset names for training, as present in paths_catalog.py
+_C.DATASETS.TRAIN = ()
+# List of the dataset names for testing, as present in paths_catalog.py
+_C.DATASETS.TEST = ()
+# Use is_crowd label
+_C.DATASETS.USE_CROWD = False
+_C.DATASETS.CLASS_AGNOSTIC = False
+_C.DATASETS.CLASS_CONCAT = False
+_C.DATASETS.MAX_BOX = -1
+_C.DATASETS.SAMPLE_RATIO = 0.0
+_C.DATASETS.FEW_SHOT = 0
+# SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
+_C.DATASETS.SHUFFLE_SEED = 0
+_C.DATASETS.PREDEFINED_TEXT = ""
+_C.DATASETS.ALTERNATIVE_TRAINING = False
+_C.DATASETS.MULTISTAGE_TRAINING = False
+_C.DATASETS.REGISTER = CN(new_allowed=True)
+_C.DATASETS.BOX_THRESHOLD = 0.1
+# Duplicate Dataset
+_C.DATASETS.COCO_COPY = 1
+_C.DATASETS.LVIS_COPY = 1
+_C.DATASETS.FLICKR_COPY = 1
+_C.DATASETS.MIXED_COPY = 1
+_C.DATASETS.OBJECT365_COPY = 1
+_C.DATASETS.VG_COPY = 1
+_C.DATASETS.OI_COPY = 1
+_C.DATASETS.IN_COPY = 1
+_C.DATASETS.MIXED_GPT_COPY = 1
+
+# Duplicate Dataset
+_C.DATASETS.COCO_COPY = 1
+_C.DATASETS.FLICKR_COPY = 1
+_C.DATASETS.MIXED_COPY = 1
+_C.DATASETS.OBJECT365_COPY = 1
+_C.DATASETS.VG_COPY = 1
+_C.DATASETS.OI_COPY = 1
+_C.DATASETS.IN_COPY = 1
+_C.DATASETS.REFCOCO_COPY = 1
+_C.DATASETS.GENERAL_COPY = -1
+_C.DATASETS.GENERAL_COPY_TEST = -1
+
+# OD to Grounding
+_C.DATASETS.RANDOM_SAMPLE_NEG = -1
+_C.DATASETS.ADD_DET_PROMPT = False
+_C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
+_C.DATASETS.USE_OD_AUG = False
+_C.DATASETS.USE_COCO_FORMAT = False
+_C.DATASETS.CONTROL_PROB = ()
+_C.DATASETS.DISABLE_SHUFFLE = False
+_C.DATASETS.PROMPT_VERSION = ""
+_C.DATASETS.PROMPT_LIMIT_NEG = -1
+_C.DATASETS.POS_QUESTION_PROB = 0.6
+_C.DATASETS.NEG_QUESTION_PROB = 0.8
+_C.DATASETS.FULL_QUESTION_PROB = 0.5
+_C.DATASETS.ONE_HOT = False
+_C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
+
+_C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
+_C.DATASETS.SEPARATION_TOKENS = " "
+
+# LVIS
+_C.DATASETS.LVIS_USE_NORMAL_AP = False
+_C.DATASETS.LVIS_TOPK = 10000
+_C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
+
+# Caption
+_C.DATASETS.BING_INDEX_LIST = []
+_C.DATASETS.CAPTION_MIN_BOX = 1
+_C.DATASETS.REPLACE_CLEAN_LABEL = False
+_C.DATASETS.FURTHER_SCREEN = False
+_C.DATASETS.CAPTION_CONF = 0.9
+_C.DATASETS.CAPTION_NMS = 0.9
+_C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
+_C.DATASETS.INFERENCE_CAPTION = False
+_C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
+_C.DATASETS.RANDOM_PACK_PROB = -1.0
+_C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
+_C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
+_C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
+_C.DATASETS.LOCAL_DEBUG = False
+
+
+# Od in the wild
+_C.DATASETS.PREDEFINED_TEXT = None
+_C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
+_C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
+_C.DATASETS.OVERRIDE_CATEGORY = None
+_C.DATASETS.USE_OVERRIDE_CATEGORY = False
+_C.DATASETS.SUPRESS_QUERY = None
+_C.DATASETS.USE_SUPRESS_QUERY = False
+_C.DATASETS.USE_CAPTION_PROMPT = False
+_C.DATASETS.CAPTION_PROMPT = None
+
+_C.DATASETS.PREDOWNLOAD_BING = False
+_C.DATASETS.PREDOWNLOAD_WITH_AZCOPY = False
+_C.DATASETS.FLICKR_GT_TYPE = "separate"
+
+# PACO
+_C.DATASETS.OD_TO_GROUNDING_VERSION = "legacy"
+
+# description
+_C.DATASETS.DESCRIPTION_FILE = None
+_C.DATASETS.SIMILARITY_FILE = None
+_C.DATASETS.CAPTION_VOCAB_FILE = None
+
+# caption augmentation
+_C.DATASETS.CAPTION_AUGMENTATION_VOCAB = None
+_C.DATASETS.CAPTION_AUGMENTATION_VERSION = None
+
+_C.DATASETS.CC_CAPTION_AUGMENTATION_VERSION = None
+
+_C.DATASETS.KEEP_NOUN_RATIO = 0.0
+
+# VQA
+_C.DATASETS.DIVER_BOX_FOR_VQA = False
+
+# -----------------------------------------------------------------------------
+# DataLoader
+# -----------------------------------------------------------------------------
+_C.DATALOADER = CN()
+# Number of data loading threads
+_C.DATALOADER.NUM_WORKERS = 4
+# If > 0, this enforces that each collated batch should have a size divisible
+# by SIZE_DIVISIBILITY
+_C.DATALOADER.SIZE_DIVISIBILITY = 0
+# If True, each batch should contain only images for which the aspect ratio
+# is compatible. This groups portrait images together, and landscape images
+# are not batched with portrait images.
+_C.DATALOADER.ASPECT_RATIO_GROUPING = True
+# Define min number of keypoints required from GT, for example 10 out of 17
+_C.DATALOADER.MIN_KPS_PER_IMS = 0
+# Use random sampler during training
+_C.DATALOADER.USE_RANDOM_SEED = False
+
+_C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
+# ---------------------------------------------------------------------------- #
+# Backbone options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BACKBONE = CN()
+
+# The backbone conv body to use
+# The string must match a function that is imported in modeling.model_builder
+# (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
+# backbone)
+_C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
+
+# Add StopGrad at a specified stage so the bottom layers are frozen
+_C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
+_C.MODEL.BACKBONE.FREEZE = False
+_C.MODEL.BACKBONE.GROUP = 1
+_C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
+# Option to reset bn running statics
+_C.MODEL.BACKBONE.RESET_BN = False
+# Backbone Normalization Level
+_C.MODEL.BACKBONE.NORM_LEVEL = 3
+# BN for backbone
+_C.MODEL.BACKBONE.USE_BN = False
+# Sync BN for backbone
+_C.MODEL.BACKBONE.USE_SYNCBN = False
+_C.MODEL.BACKBONE.USE_NSYNCBN = False
+# GN for backbone
+_C.MODEL.BACKBONE.USE_GN = False
+# Evo Norm for backbone
+_C.MODEL.BACKBONE.USE_EN = False
+# Layers for backbone
+_C.MODEL.BACKBONE.USE_DFCONV = False
+_C.MODEL.BACKBONE.USE_DYRELU = False
+_C.MODEL.BACKBONE.USE_SE = False
+_C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
+_C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
+_C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
+_C.MODEL.BACKBONE.FPN_LAYER = ()
+_C.MODEL.BACKBONE.USE_CHECKPOINT = False
+# Add JF efficient det cfgs
+_C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
+_C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
+_C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
+
+_C.MODEL.BACKBONE.FUSION_VERSION = "v1" # Whether to use symmetric or non symmetric fusion
+
+_C.MODEL.LANGUAGE_BACKBONE = CN()
+_C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
+_C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
+_C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
+_C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
+_C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
+_C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
+_C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
+_C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
+_C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
+_C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
+
+_C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
+_C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
+_C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
+_C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
+_C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
+_C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
+_C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
+_C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
+
+_C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
+# ---------------------------------------------------------------------------- #
+# FPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FPN = CN()
+_C.MODEL.FPN.FREEZE = False
+_C.MODEL.FPN.USE_GN = False
+_C.MODEL.FPN.USE_RELU = False
+_C.MODEL.FPN.USE_DYRELU = False
+_C.MODEL.FPN.DROP_BLOCK = True
+_C.MODEL.FPN.DROP_PROB = 0.3
+_C.MODEL.FPN.DROP_SIZE = 3
+_C.MODEL.FPN.USE_SPP = False
+_C.MODEL.FPN.USE_PAN = False
+_C.MODEL.FPN.USE_DYHEAD = False
+_C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
+# ---------------------------------------------------------------------------- #
+# BIFPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.BIFPN = CN()
+_C.MODEL.BIFPN.NUM_REPEATS = 1
+_C.MODEL.BIFPN.USE_ATTENTION = True
+
+# ---------------------------------------------------------------------------- #
+# Group Norm options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.GROUP_NORM = CN()
+# Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
+_C.MODEL.GROUP_NORM.DIM_PER_GP = -1
+# Number of groups in GroupNorm (-1 if using DIM_PER_GP)
+_C.MODEL.GROUP_NORM.NUM_GROUPS = 16
+# GroupNorm's small constant in the denominator
+_C.MODEL.GROUP_NORM.EPSILON = 1e-5
+
+# ---------------------------------------------------------------------------- #
+# Evo Norm options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.EVO_NORM = CN()
+# Number of groups in EvoNorm (-1 if using DIM_PER_GP)
+_C.MODEL.EVO_NORM.NUM_GROUPS = 8
+# EvoNorm's small constant in the denominator
+_C.MODEL.EVO_NORM.EPSILON = 1e-5
+
+# ---------------------------------------------------------------------------- #
+# RetinaNet Options (Follow the Detectron version)
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RETINANET = CN()
+# This is the number of foreground classes and background.
+_C.MODEL.RETINANET.NUM_CLASSES = 81
+# Convolutions to use in the cls and bbox tower
+# NOTE: this doesn't include the last conv for logits
+_C.MODEL.RETINANET.NUM_CONVS = 4
+# During inference, #locs to select based on cls score before NMS is performed
+# per FPN level
+_C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
+# Prior prob for the positives at the beginning of training. This is used to set
+# the bias init for the logits layer
+_C.MODEL.RETINANET.PRIOR_PROB = 0.01
+# Inference cls score threshold, anchors with score > INFERENCE_TH are
+# considered for inference
+_C.MODEL.RETINANET.INFERENCE_TH = 0.05
+# NMS threshold used in RetinaNet
+_C.MODEL.RETINANET.NMS_TH = 0.4
+_C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
+
+# ---------------------------------------------------------------------------- #
+# Focal Loss Options (Follow the Detectron version)
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FOCAL = CN()
+# Weight for bbox_regression loss
+_C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
+# Smooth L1 loss beta for bbox regression
+_C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
+# IoU overlap ratio for labeling an anchor as positive
+# Anchors with >= iou overlap are labeled positive
+_C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
+# IoU overlap ratio for labeling an anchor as negative
+# Anchors with < iou overlap are labeled negative
+_C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
+# Focal loss parameter: alpha
+_C.MODEL.FOCAL.LOSS_ALPHA = 0.25
+# Focal loss parameter: gamma
+_C.MODEL.FOCAL.LOSS_GAMMA = 2.0
+
+# ---------------------------------------------------------------------------- #
+# FCOS Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.FCOS = CN()
+_C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
+_C.MODEL.FCOS.PRIOR_PROB = 0.01
+_C.MODEL.FCOS.INFERENCE_TH = 0.05
+_C.MODEL.FCOS.NMS_TH = 0.6
+_C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.FCOS.NUM_CONVS = 4
+# if use deformable conv to align features
+_C.MODEL.FCOS.USE_DFCONV = False
+
+# if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
+_C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
+# IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
+_C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
+
+_C.MODEL.FCOS.NORM_REG_TARGETS = False
+_C.MODEL.FCOS.CENTERNESS_ON_REG = False
+_C.MODEL.FCOS.USE_GT_CENTER = False
+
+_C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
+_C.MODEL.FCOS.USE_GN = False
+_C.MODEL.FCOS.USE_BN = False
+
+_C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
+_C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
+_C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
+
+# ---------------------------------------------------------------------------- #
+# ATSS Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ATSS = CN()
+_C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.ATSS.PRIOR_PROB = 0.01
+_C.MODEL.ATSS.INFERENCE_TH = 0.05
+_C.MODEL.ATSS.NMS_TH = 0.6
+_C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.ATSS.NUM_CONVS = 4
+# the channels of convolutions used in the cls and bbox tower
+_C.MODEL.ATSS.CHANNELS = 128
+# if use deformable conv to align features
+_C.MODEL.ATSS.USE_DFCONV = False
+
+# topk for selecting candidate positive samples from each level
+_C.MODEL.ATSS.TOPK = 9
+
+# Weight for bbox_regression loss
+_C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
+
+_C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
+_C.MODEL.ATSS.USE_GN = False
+_C.MODEL.ATSS.USE_BN = False
+
+_C.MODEL.ATSS.USE_DYRELU = False
+_C.MODEL.ATSS.USE_SE = False
+
+_C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
+_C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
+_C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
+# ---------------------------------------------------------------------------- #
+# DYHEAD Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.DYHEAD = CN()
+_C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
+_C.MODEL.DYHEAD.PRIOR_PROB = 0.01
+
+# the number of convolutions used in the cls and bbox tower
+_C.MODEL.DYHEAD.NUM_CONVS = 4
+# the channels of convolutions used in the cls and bbox tower
+_C.MODEL.DYHEAD.CHANNELS = 128
+_C.MODEL.DYHEAD.GROUPS = 1
+# if use deformable conv to align features
+_C.MODEL.DYHEAD.USE_DFCONV = False
+
+# topk for selecting candidate positive samples from each level
+_C.MODEL.DYHEAD.TOPK = 9
+
+_C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
+
+_C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
+_C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # # temperature (shallow contrastive)
+
+_C.MODEL.DYHEAD.USE_GN = False
+_C.MODEL.DYHEAD.USE_NSYNCBN = False
+_C.MODEL.DYHEAD.USE_SYNCBN = False
+
+_C.MODEL.DYHEAD.USE_DYFUSE = False
+_C.MODEL.DYHEAD.USE_DYRELU = False
+
+_C.MODEL.DYHEAD.CONV_FUNC = ""
+
+# CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
+_C.MODEL.DYHEAD.COSINE_SCALE = -1.0
+
+_C.MODEL.DYHEAD.FUSE_CONFIG = CN()
+_C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
+_C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
+_C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
+_C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
+
+# Controls for
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
+
+# MLM Loss
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
+_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
+_C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
+
+# Shallow Contrastive Loss (FPN)
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
+_C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
+
+# Span Loss
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_SPAN_LOSS = False # will reuse the green light span field to indicate span boundary
+_C.MODEL.DYHEAD.FUSE_CONFIG.SPAN_VERSION = None
+_C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NOOBJ_TOKEN = False
+
+
+# Shallow Contrastive Loss (BACKBONE)
+_C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
+
+_C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
+# Mute non-essential tokens
+_C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NON_ESSENTIAL_TOKENS = False
+# use checkpoint to save memory
+_C.MODEL.DYHEAD.USE_CHECKPOINT = False
+
+# ---------------------------------------------------------------------------- #
+# DYDETR Options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.DYDETR = CN()
+_C.MODEL.DYDETR.NHEADS = 8
+_C.MODEL.DYDETR.DROPOUT = 0.0
+_C.MODEL.DYDETR.DIM_FEEDFORWARD = 2048
+_C.MODEL.DYDETR.ACTIVATION = "relu"
+_C.MODEL.DYDETR.HIDDEN_DIM = 256
+_C.MODEL.DYDETR.NUM_CLS = 1
+_C.MODEL.DYDETR.NUM_REG = 3
+_C.MODEL.DYDETR.NUM_HEADS = 6
+_C.MODEL.DYDETR.NUM_CLASSES = 81
+_C.MODEL.DYDETR.NUM_PROPOSALS = 300
+
+# Dynamic Conv.
+_C.MODEL.DYDETR.NUM_DYNAMIC = 2
+_C.MODEL.DYDETR.DIM_DYNAMIC = 64
+
+# Loss.
+_C.MODEL.DYDETR.CLASS_WEIGHT = 2.0
+_C.MODEL.DYDETR.GIOU_WEIGHT = 2.0
+_C.MODEL.DYDETR.L1_WEIGHT = 5.0
+_C.MODEL.DYDETR.DEEP_SUPERVISION = True
+_C.MODEL.DYDETR.NO_OBJECT_WEIGHT = 0.1
+
+# Focal Loss.
+_C.MODEL.DYDETR.USE_FOCAL = True
+_C.MODEL.DYDETR.ALPHA = 0.25
+_C.MODEL.DYDETR.GAMMA = 2.0
+_C.MODEL.DYDETR.PRIOR_PROB = 0.01
+
+_C.MODEL.DYDETR.APPEND_BOX = False
+
+# GROUNDING RELATED
+_C.MODEL.DYDETR.INCLUDE_LANGUAGE_DECODER = False
+_C.MODEL.DYDETR.USE_DOT_PRODUCT_TOKEN_LOSS = False
+_C.MODEL.DYDETR.LOG_SCALE = 0.0 # temperature
+_C.MODEL.DYDETR.RESET_PARAMETERS = True
+_C.MODEL.DYDETR.USE_GROUNDING_MATCHER_SETCRITERION = False
+_C.MODEL.DYDETR.MDETR_PLAIN_INFERENCE = False
+_C.MODEL.DYDETR.OVERRIDE_LANGUAGE_MODEL_FOR_TOKEN_LOSS = False
+_C.MODEL.DYDETR.NORMALIZE_PER_BOX = False
+_C.MODEL.DYDETR.RESET_SKIP_DOT_PRODUCT_WEIGHTS = False
+_C.MODEL.DYDETR.DEBUG = False
+_C.MODEL.DYDETR.AGGREGATE_METHOD = "MEAN"
+_C.MODEL.DYDETR.EARLY_FUSE_ON = False
+_C.MODEL.DYDETR.DYTOWER_ON = False
+_C.MODEL.DYDETR.USE_FUSED_LANGUAGE_FEATURES = True
+# ---------------------------------------------------------------------------- #
+# RPN options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RPN = CN()
+_C.MODEL.RPN.USE_FPN = False
+# Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
+_C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
+# Stride of the feature map that RPN is attached.
+# For FPN, number of strides should match number of scales
+_C.MODEL.RPN.ANCHOR_STRIDE = (16,)
+# RPN anchor aspect ratios
+_C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
+# Anchor shift away ration from the center for r,t,l,d
+_C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
+# Use center to decide anchor size
+_C.MODEL.RPN.USE_RELATIVE_SIZE = False
+# Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
+# Set to -1 or a large value, e.g. 100000, to disable pruning anchors
+_C.MODEL.RPN.STRADDLE_THRESH = 0
+# Anchor scales per octave for complex anchors
+_C.MODEL.RPN.OCTAVE = 2.0
+_C.MODEL.RPN.SCALES_PER_OCTAVE = 3
+# Minimum overlap required between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
+# ==> positive RPN example)
+_C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
+# Maximum overlap allowed between an anchor and ground-truth box for the
+# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
+# ==> negative RPN example)
+_C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
+# Total number of RPN examples per image
+_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
+# Target fraction of foreground (positive) examples per RPN minibatch
+_C.MODEL.RPN.POSITIVE_FRACTION = 0.5
+# Number of top scoring RPN proposals to keep before applying NMS
+# When FPN is used, this is *per FPN level* (not total)
+_C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
+_C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
+# Number of top scoring RPN proposals to keep after applying NMS
+_C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
+_C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
+# NMS threshold used on RPN proposals
+_C.MODEL.RPN.NMS_THRESH = 0.7
+# Proposal height and width both need to be greater than RPN_MIN_SIZE
+# (a the scale used during training or inference)
+_C.MODEL.RPN.MIN_SIZE = 0
+# Number of top scoring RPN proposals to keep after combining proposals from
+# all FPN levels
+_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
+_C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
+# Custom rpn head, empty to use default conv or separable conv
+_C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
+_C.MODEL.RPN.FREEZE = False
+_C.MODEL.RPN.FORCE_BOXES = False
+_C.MODEL.RPN.RETURN_FUSED_FEATURES = False  # NOTE(review): presumably exposes the fused RPN features to downstream consumers — confirm
+
+# ---------------------------------------------------------------------------- #
+# ROI HEADS options
+# ---------------------------------------------------------------------------- #
+_C.MODEL.ROI_HEADS = CN()
+_C.MODEL.ROI_HEADS.USE_FPN = False
+# Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+_C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
+# Overlap threshold for an RoI to be considered background
+# (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
+_C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
+# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+# These are empirically chosen to approximately lead to unit variance targets
+_C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+# RoI minibatch size *per image* (number of regions of interest [ROIs])
+# Total number of RoIs per training minibatch =
+# TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
+# E.g., a common configuration is: 512 * 2 * 8 = 8192
+_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+# Only used in test mode
+
+# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+# balance obtaining high recall with not having too many low precision
+# detections that will slow down inference post processing steps (like NMS)
+_C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
+# Overlap threshold used for non-maximum suppression (suppress boxes with
+# IoU >= this threshold)
+_C.MODEL.ROI_HEADS.NMS = 0.5
+# Maximum number of detections to return per image (100 is based on the limit
+# established for the COCO dataset)
+_C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
+
+_C.MODEL.ROI_BOX_HEAD = CN()
+_C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+_C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
+_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
+# Hidden layer dimension when using an MLP for the RoI box head
+_C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
+# Group Normalization (GN) toggle for the box head
+_C.MODEL.ROI_BOX_HEAD.USE_GN = False
+# Dilation of the box-head conv layers
+_C.MODEL.ROI_BOX_HEAD.DILATION = 1
+_C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
+_C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
+# Use D2 (Detectron2) style ROIAlignV2
+_C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
+
+_C.MODEL.ROI_MASK_HEAD = CN()
+_C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+_C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
+_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
+_C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
+_C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
+_C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+# Whether or not resize and translate masks to the input image.
+_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
+_C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
+# Dilation of the mask-head conv layers
+_C.MODEL.ROI_MASK_HEAD.DILATION = 1
+# Group Normalization (GN) toggle for the mask head
+_C.MODEL.ROI_MASK_HEAD.USE_GN = False
+# NOTE(review): "HG" presumably = hourglass scale factor — confirm
+_C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
+
+_C.MODEL.ROI_KEYPOINT_HEAD = CN()
+_C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
+_C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
+_C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
+_C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
+_C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
+_C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17 # 17 matches COCO person keypoints — presumably; confirm
+_C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = () # If left empty, use default names
+_C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+
+# ---------------------------------------------------------------------------- #
+# ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+# Note that parts of a resnet may be used for both the backbone and the head
+# These options apply to both
+# ---------------------------------------------------------------------------- #
+_C.MODEL.RESNETS = CN()
+
+_C.MODEL.RESNETS.USE_STEM3X3 = False
+_C.MODEL.RESNETS.WITH_SE = False  # NOTE(review): SE presumably = Squeeze-and-Excitation blocks — confirm
+_C.MODEL.RESNETS.USE_AVG_DOWN = False
+
+# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+_C.MODEL.RESNETS.NUM_GROUPS = 1
+
+# Baseline width of each group
+_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+# Place the stride 2 conv on the 1x1 filter
+# Use True only for the original MSRA ResNet; use False for C2 and Torch models
+_C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+# Residual transformation function
+_C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
+# ResNet's stem function (conv1 and pool1)
+_C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
+
+# Apply dilation in stage "res5"
+_C.MODEL.RESNETS.RES5_DILATION = 1
+
+_C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
+_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+_C.MODEL.RESNETS.REVISION = "resnet_light"
+# Deformable convolutions (DCN) — enabled per stage
+_C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
+_C.MODEL.RESNETS.WITH_MODULATED_DCN = False
+_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
+
+# ---------------------------------------------------------------------------- #
+# Swin Transformer
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SWINT = CN()
+_C.MODEL.SWINT.EMBED_DIM = 96
+_C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
+_C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
+_C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
+_C.MODEL.SWINT.WINDOW_SIZE = 7
+_C.MODEL.SWINT.MLP_RATIO = 4
+_C.MODEL.SWINT.DROP_PATH_RATE = 0.2
+_C.MODEL.SWINT.APE = False  # NOTE(review): APE presumably = absolute position embedding — confirm
+_C.MODEL.SWINT.VERSION = "v1"
+_C.MODEL.SWINT.OUT_NORM = True
+_C.MODEL.SWINT.LAYER_SCALE = 0  # NOTE(review): 0 presumably disables layer scale — confirm
+
+# ---------------------------------------------------------------------------- #
+# CVT SPEC
+# ---------------------------------------------------------------------------- #
+_C.MODEL.SPEC = CN(new_allowed=True)  # free-form subtree: config files may add arbitrary new keys here
+
+# ---------------------------------------------------------------------------- #
+# CLIP SPEC
+# ---------------------------------------------------------------------------- #
+_C.MODEL.CLIP = CN()
+_C.MODEL.CLIP.CONTEXT_LENGTH = 256 # max text token length (upstream CLIP's standard default is 77)
+_C.MODEL.CLIP.WIDTH = 512
+_C.MODEL.CLIP.LAYERS = 12
+_C.MODEL.CLIP.HEADS = 8
+_C.MODEL.CLIP.DROP_PATH = 0.0
+_C.MODEL.CLIP.TOKENIZER = "clip"
+_C.MODEL.CLIP.VOCAB_SIZE = 49408
+
+# ---------------------------------------------------------------------------- #
+# SEARCH
+# ---------------------------------------------------------------------------- #
+
+_C.SEARCH = CN()  # NOTE(review): select/mutation/crossover knobs suggest an evolutionary search — confirm
+_C.SEARCH.MAX_EPOCH = 20
+_C.SEARCH.SELECT_NUM = 20
+_C.SEARCH.POPULATION_NUM = 64
+_C.SEARCH.MUTATION_NUM = 24
+_C.SEARCH.CROSSOVER_NUM = 24
+_C.SEARCH.MUTATION_PROB = 0.1
+
+# ---------------------------------------------------------------------------- #
+# Solver
+# ---------------------------------------------------------------------------- #
+_C.SOLVER = CN()
+_C.SOLVER.USE_AMP = False  # NOTE(review): AMP presumably = automatic mixed precision — confirm
+
+_C.SOLVER.MAX_ITER = 40000
+_C.SOLVER.MULTI_MAX_ITER = () # set different max iter for different stage
+_C.SOLVER.MAX_EPOCH = 0 # any epoch number>0 will overwrite max_iter
+_C.SOLVER.MULTI_MAX_EPOCH = () # set different max epoch for different stage
+
+_C.SOLVER.OPTIMIZER = "SGD" # choices: "SGD" or "ADAMW"
+
+_C.SOLVER.BASE_LR = 0.001
+
+_C.SOLVER.LANG_LR = 0.00001  # separate (much lower) LR for the language branch — presumably; confirm
+_C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
+_C.SOLVER.FUSION_LR_FACTOR = 1.0
+
+
+_C.SOLVER.BIAS_LR_FACTOR = 2
+_C.SOLVER.GRAD_CLIP = 0.0
+# Detectron2 (D2) style gradient clipping options
+_C.SOLVER.CLIP_GRADIENTS = CN()
+_C.SOLVER.CLIP_GRADIENTS.ENABLED = False
+_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
+_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
+_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+_C.SOLVER.MODEL_EMA = 0.0  # NOTE(review): presumably the EMA decay rate, 0.0 = disabled — confirm
+
+_C.SOLVER.MOMENTUM = 0.9
+
+_C.SOLVER.WEIGHT_DECAY = 0.0005
+_C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
+_C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
+_C.SOLVER.WEIGHT_DECAY_HEAD_FACTOR = 1.0
+
+# use cosine lr to replace default multistage
+_C.SOLVER.USE_COSINE = False
+_C.SOLVER.MIN_LR = 0.000001
+
+_C.SOLVER.GAMMA = 0.1
+_C.SOLVER.STEPS = (30000,)
+
+_C.SOLVER.USE_AUTOSTEP = False
+_C.SOLVER.STEP_PATIENCE = 5
+
+_C.SOLVER.WARMUP_FACTOR = 1.0 / 3
+_C.SOLVER.WARMUP_ITERS = 500
+_C.SOLVER.WARMUP_METHOD = "linear"
+
+_C.SOLVER.CHECKPOINT_PERIOD = 2500
+_C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
+_C.SOLVER.TEST_WITH_INFERENCE = False
+_C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
+# Number of images per batch
+# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+# see 2 images per batch
+_C.SOLVER.IMS_PER_BATCH = 16
+# This is the max negative ratio allowed per batch
+_C.SOLVER.MAX_NEG_PER_BATCH = 0.1
+
+_C.SOLVER.SEED = 0
+_C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
+
+
+_C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
+# -1 means tuning the whole model;
+# 1 means tuning the whole language model; 1.5 means tuning the box head as well
+
+_C.SOLVER.FIND_UNUSED_PARAMETERS = True
+_C.SOLVER.DATASET_LENGTH = -1 # Just for logging purpose
+_C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
+_C.SOLVER.USE_EMA_FOR_MONITOR = False
+
+_C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
+_C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
+_C.SOLVER.RESUME_SKIP_SCHEDULE = False # when resuming from a checkpoint, optionally skip the saved schedule state (original note was truncated — confirm)
+
+# ---------------------------------------------------------------------------- #
+# Specific test options
+# ---------------------------------------------------------------------------- #
+_C.TEST = CN()
+_C.TEST.EXPECTED_RESULTS = []
+_C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
+_C.TEST.DURING_TRAINING = False
+# Number of images per batch
+# This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+# see 2 images per batch
+_C.TEST.IMS_PER_BATCH = 16
+# Special Test Configuration
+_C.TEST.USE_MULTISCALE = False
+# _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
+# _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
+_C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
+_C.TEST.RANGES = (  # 12 (min, max) size ranges, presumably paired 1:1 with SCALES above — confirm
+    (96, 10000),
+    (96, 10000),
+    (64, 10000),
+    (64, 10000),
+    (64, 10000),
+    (0, 10000),
+    (0, 10000),
+    (0, 256),
+    (0, 256),
+    (0, 192),
+    (0, 192),
+    (0, 96),
+)
+_C.TEST.MAX_SIZE = 2500
+_C.TEST.FLIP = True
+_C.TEST.SPECIAL_NMS = "none" # ('none', 'soft-nms', 'vote', 'soft-vote')
+_C.TEST.TH = 0.6 # threshold for nms or vote
+_C.TEST.PRE_NMS_TOP_N = 1000
+_C.TEST.NUM_CLASSES = 81
+_C.TEST.SELECT_CLASSES = ()
+
+_C.TEST.EVAL_TASK = ""
+_C.TEST.SUBSET = -1
+_C.TEST.CHUNKED_EVALUATION = -1
+_C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
+_C.TEST.CHUNK_METHOD = "random" # chunking strategy for chunked evaluation — TODO enumerate the valid options
+_C.TEST.CHUNK_INFERENCE_VERSION = "v1" # "v2" slightly modifies the ATSS inference code (original note was truncated — confirm details)
+# ---------------------------------------------------------------------------- #
+# Misc options
+# ---------------------------------------------------------------------------- #
+_C.OUTPUT_DIR = "OUTPUT"
+
+_C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
+
+# TensorBoard experiment location
+_C.TENSORBOARD_EXP = "OUTPUT"
+
+_C.GLIPKNOW = CN()  # NOTE(review): knowledge-augmented GLIP options — presumably; confirm
+_C.GLIPKNOW.KNOWLEDGE_FILE = ""
+_C.GLIPKNOW.KNOWLEDGE_TYPE = ""
+_C.GLIPKNOW.MAX_NUM_CLASSES_PER_BATCH_TRAIN = -1
+_C.GLIPKNOW.PARALLEL_LANGUAGE_INPUT = False
+_C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first"
+_C.GLIPKNOW.GPT3_NUM = 5
+_C.GLIPKNOW.WIKI_AND_GPT3 = False
\ No newline at end of file
diff --git a/maskrcnn_benchmark/config/paths_catalog.py b/maskrcnn_benchmark/config/paths_catalog.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f128aaa53370898557d9a43e1dd9dffbc1f2ad
--- /dev/null
+++ b/maskrcnn_benchmark/config/paths_catalog.py
@@ -0,0 +1,779 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+"""Centralized catalog of paths."""
+
+import os
+
+
+def try_to_find(file, return_dir=False, search_path=["./DATASET", "./OUTPUT", "./data", "./MODEL"]):  # NOTE(review): mutable default list — never mutated here, but a tuple would be safer
+    if not file:  # empty/None path: nothing to resolve, hand it back unchanged
+        return file
+
+    if file.startswith("catalog://"):  # catalog URIs are resolved elsewhere; pass through verbatim
+        return file
+
+    DATASET_PATH = ["./"]  # search order: cwd, then $DATASET (if set), then the default roots
+    if "DATASET" in os.environ:
+        DATASET_PATH.append(os.environ["DATASET"])
+    DATASET_PATH += search_path
+
+    for path in DATASET_PATH:
+        if os.path.exists(os.path.join(path, file)):
+            if return_dir:
+                return path  # caller wants the containing root directory, not the full path
+            else:
+                return os.path.join(path, file)
+
+    print("Cannot find {} in {}".format(file, DATASET_PATH))
+    exit(1)  # NOTE(review): hard process exit from library code — consider raising FileNotFoundError instead
+
+
+class DatasetCatalog(object):
+ DATASETS = {
+ # pretrained grounding dataset
+ # mixed vg and coco
+ "mixed_train": {
+ "coco_img_dir": "coco/train2014",
+ "vg_img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/final_mixed_train.json",
+ },
+ "mixed_train_no_coco": {
+ "coco_img_dir": "coco/train2014",
+ "vg_img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
+ },
+ # flickr30k
+ "flickr30k_train": {
+ "img_folder": "flickr30k/flickr30k_images/train",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
+ "is_train": True,
+ },
+ "flickr30k_val": {
+ "img_folder": "flickr30k/flickr30k_images/val",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
+ "is_train": False,
+ },
+ "flickr30k_test": {
+ "img_folder": "flickr30k/flickr30k_images/test",
+ "ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
+ "is_train": False,
+ },
+ # refcoco
+ "refexp_all_val": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/final_refexp_val.json",
+ "is_train": False,
+ },
+ "refcoco_train": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco_train.json",
+ "is_train": True,
+ },
+ "refcoco_val": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+ "is_train": False,
+ },
+ "refcoco_real_val": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+ "is_train": False,
+ },
+ "refcoco_testA": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco_testA.json",
+ "is_train": False,
+ },
+ "refcoco_testB": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco_testB.json",
+ "is_train": False,
+ },
+ "refcoco+_train": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco+_train.json",
+ "is_train": True,
+ },
+ "refcoco+_val": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco+_val.json",
+ "is_train": False,
+ },
+ "refcoco+_testA": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco+_testA.json",
+ "is_train": False,
+ },
+ "refcoco+_testB": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcoco+_testB.json",
+ "is_train": False,
+ },
+ "refcocog_train": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcocog_train.json",
+ "is_train": True,
+ },
+ "refcocog_val": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcocog_val.json",
+ "is_train": False,
+ },
+ "refcocog_test": {
+ "img_dir": "coco/train2014",
+ "ann_file": "mdetr_annotations/finetune_refcocog_test_corrected.json",
+ "is_train": False,
+ },
+ # gqa
+ "gqa_val": {"img_dir": "gqa/images", "ann_file": "mdetr_annotations/final_gqa_val.json", "is_train": False},
+ # phrasecut
+ "phrasecut_train": {
+ "img_dir": "gqa/images",
+ "ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
+ "is_train": True,
+ },
+ # caption
+ "bing_caption_train": {
+ "yaml_path": "BingData/predict_yaml",
+ "yaml_name": "dreamstime_com_dyhead_objvg_e39",
+ "yaml_name_no_coco": "dreamstime_com_Detection_Pretrain_NoCOCO_Packed125",
+ "is_train": True,
+ },
+ # od to grounding
+ # coco tsv
+ "coco_dt_train": {
+ "dataset_file": "coco_dt",
+ "yaml_path": "coco_tsv/coco_obj.yaml",
+ "is_train": True,
+ },
+ "COCO_odinw_train_8copy_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
+ "is_train": True,
+ },
+ "COCO_odinw_val_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/COCO_odinw_val.yaml",
+ "is_train": False,
+ },
+ # lvis tsv
+ "lvisv1_dt_train": {
+ "dataset_file": "lvisv1_dt",
+ "yaml_path": "coco_tsv/LVIS_v1_train.yaml",
+ "is_train": True,
+ },
+ "LVIS_odinw_train_8copy_dt_train": {
+ "dataset_file": "coco_odinw_dt",
+ "yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
+ "is_train": True,
+ },
+ # object365 tsv
+ "object365_dt_train": {
+ "dataset_file": "object365_dt",
+ "yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
+ "is_train": True,
+ },
+ "object365_odinw_2copy_dt_train": {
+ "dataset_file": "object365_odinw_dt",
+ "yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
+ "is_train": True,
+ },
+ "objects365_odtsv_train": {
+ "dataset_file": "objects365_odtsv",
+ "yaml_path": "Objects365/train.cas2000.yaml",
+ "is_train": True,
+ },
+ "objects365_odtsv_val": {
+ "dataset_file": "objects365_odtsv",
+ "yaml_path": "Objects365/val.yaml",
+ "is_train": False,
+ },
+ # ImagetNet OD
+ "imagenetod_train_odinw_2copy_dt": {
+ "dataset_file": "imagenetod_odinw_dt",
+ "yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
+ "is_train": True,
+ },
+ # OpenImage OD
+ "oi_train_odinw_dt": {
+ "dataset_file": "oi_odinw_dt",
+ "yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
+ "is_train": True,
+ },
+ # vg tsv
+ "vg_dt_train": {
+ "dataset_file": "vg_dt",
+ "yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
+ "is_train": True,
+ },
+ "vg_odinw_clipped_8copy_dt_train": {
+ "dataset_file": "vg_odinw_clipped_8copy_dt",
+ "yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
+ "is_train": True,
+ },
+ "vg_vgoi6_clipped_8copy_dt_train": {
+ "dataset_file": "vg_vgoi6_clipped_8copy_dt",
+ "yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
+ "is_train": True,
+ },
+ # coco json
+ "coco_grounding_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/instances_train2017.json",
+ "is_train": True,
+ },
+ "lvis_grounding_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+
+ "lvis_evaluation_val": {
+ "img_dir": "lvis/coco2017",
+ "ann_file": "lvis/lvis_v1_minival_inserted_image_name.json",
+ "is_train": False,
+ },
+
+ "lvis_val": {
+ "img_dir": "coco",
+ "ann_file": "coco/annotations/lvis_od_val.json"},
+
+
+ # legacy detection dataset
+ "hsd_v001": {"img_dir": "hsd/20170901_Detection_HeadShoulder.V001/RawImages", "ann_file": "hsd/HSD_V001.json"},
+ "hsd_hddb": {"img_dir": "hddb/Images", "ann_file": "hddb/HDDB.json"},
+ "opencoco_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencoco_train.json"},
+ "opencoco_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencoco_val.json"},
+ "opencoco_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencoco_test.json"},
+ "openhuman_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhuman_train.json"},
+ "openhuman_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhuman_val.json"},
+ "openhuman_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhuman_test.json"},
+ "opencrowd_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencrowd_train.json"},
+ "opencrowd_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencrowd_val.json"},
+ "opencrowd_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencrowd_test.json"},
+ "opencar_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencar_train.json"},
+ "opencar_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencar_val.json"},
+ "opencar_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencar_test.json"},
+ "openhumancar_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhumancar_train.json"},
+ "openhumancar_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhumancar_val.json"},
+ "openhuamncar_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhumancar_test.json"},
+ "open500_train": {
+ "img_dir": "openimages/train",
+ "ann_file": "openimages/openimages_challenge_2019_train_bbox.json",
+ },
+ "open500_val": {
+ "img_dir": "openimages/val",
+ "ann_file": "openimages/openimages_challenge_2019_val_bbox.json",
+ },
+ "openproposal_test": {
+ "img_dir": "openimages/test2019",
+ "ann_file": "openimages/proposals_test.json",
+ },
+ "object365_train": {"img_dir": "object365/train", "ann_file": "object365/objects365_train.json"},
+ "object365_val": {"img_dir": "object365/val", "ann_file": "object365/objects365_val.json"},
+ "lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+ "lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_val.json"},
+ "image200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/im200_train.json"},
+ "image200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/im200_val.json"},
+ "coco_2017_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_train2017.json"},
+ "coco_2017_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/instances_val2017.json"},
+ "coco_2017_test": {"img_dir": "coco/test2017", "ann_file": "coco/annotations/image_info_test-dev2017.json"},
+ "coco10_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_minitrain2017.json"},
+ "coco_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/instances_train2014.json"},
+ "coco_2014_val": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_val2014.json"},
+ "coco_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_minival2014.json"},
+ "coco_2014_valminusminival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/instances_valminusminival2014.json",
+ },
+ "coco_2014_train_partial": {
+ "img_dir": "coco/train2014",
+ "ann_file": "coco/annotations/partial0.2_train2014.json",
+ },
+ "coco_2014_valminusminival_partial": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/partial0.2_valminusminival2014.json",
+ },
+ "coco_2014_train_few100": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few100_train2014.json"},
+ "coco_2014_train_few300": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few300_train2014.json"},
+ "coco_human_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/humans_train2014.json"},
+ "coco_human_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/humans_minival2014.json"},
+ "coco_human_2014_valminusminival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/humans_valminusminival2014.json",
+ },
+ "coco_car_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/car_train2014.json"},
+ "coco_car_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/car_minival2014.json"},
+ "coco_car_2014_valminusminival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/car_valminusminival2014.json",
+ },
+ "coco_humancar_2014_train": {
+ "img_dir": "coco/train2014",
+ "ann_file": "coco/annotations/humancar_train2014.json",
+ },
+ "coco_humancar_2014_minival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/humancar_minival2014.json",
+ },
+ "coco_humancar_2014_valminusminival": {
+ "img_dir": "coco/val2014",
+ "ann_file": "coco/annotations/humancar_valminusminival2014.json",
+ },
+ "coco_keypoint_2017_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/person_keypoints_train2017.json",
+ },
+ "coco_keypoint_2017_val": {
+ "img_dir": "coco/val2017",
+ "ann_file": "coco/annotations/person_keypoints_val2017.json",
+ },
+ "coco_headshoulder_2017_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/headshoulder_train2017.json",
+ },
+ "coco_headshoulder_2017_val": {
+ "img_dir": "coco/val2017",
+ "ann_file": "coco/annotations/headshoulder_val2017.json",
+ },
+ "coco_hskeypoint_2017_train": {
+ "img_dir": "coco/train2017",
+ "ann_file": "coco/annotations/person_hskeypoints_train2017.json",
+ },
+ "coco_hskeypoint_2017_val": {
+ "img_dir": "coco/val2017",
+ "ann_file": "coco/annotations/person_hskeypoints_val2017.json",
+ },
+ "voc_2007_train": {"data_dir": "voc/VOC2007", "split": "train"},
+ "voc_2007_train_cocostyle": {
+ "img_dir": "voc/VOC2007/JPEGImages",
+ "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json",
+ },
+ "voc_2007_val": {"data_dir": "voc/VOC2007", "split": "val"},
+ "voc_2007_val_cocostyle": {
+ "img_dir": "voc/VOC2007/JPEGImages",
+ "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json",
+ },
+ "voc_2007_test": {"data_dir": "voc/VOC2007", "split": "test"},
+ "voc_2007_test_cocostyle": {
+ "img_dir": "voc/VOC2007/JPEGImages",
+ "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json",
+ },
+ "voc_2012_train": {"data_dir": "voc/VOC2012", "split": "train"},
+ "voc_2012_train_cocostyle": {
+ "img_dir": "voc/VOC2012/JPEGImages",
+ "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json",
+ },
+ "voc_2012_val": {"data_dir": "voc/VOC2012", "split": "val"},
+ "voc_2012_val_cocostyle": {
+ "img_dir": "voc/VOC2012/JPEGImages",
+ "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json",
+ },
+ "voc_2012_test": {
+ "data_dir": "voc/VOC2012",
+ "split": "test"
+ # PASCAL VOC2012 doesn't made the test annotations available, so there's no json annotation
+ },
+ "cityscapes_fine_instanceonly_seg_train_cocostyle": {
+ "img_dir": "cityscapes/images",
+ "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json",
+ },
+ "cityscapes_fine_instanceonly_seg_val_cocostyle": {
+ "img_dir": "cityscapes/images",
+ "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json",
+ },
+ "cityscapes_fine_instanceonly_seg_test_cocostyle": {
+ "img_dir": "cityscapes/images",
+ "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json",
+ },
+ "crowdhuman_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_train.json"},
+ "crowdhuman_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_val.json"},
+ "crowdhead_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_train.json"},
+ "crowdhead_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_val.json"},
+ "crowdfull_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_train.json"},
+ "crowdfull_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_val.json"},
+ "ternium_train": {"img_dir": "ternium/images", "ann_file": "ternium/train_annotation.json"},
+ "ternium_val": {"img_dir": "ternium/images", "ann_file": "ternium/val_annotation.json"},
+ "ternium_test": {"img_dir": "ternium/images", "ann_file": "ternium/test_annotation.json"},
+ "ternium_test_crop": {"img_dir": "ternium/test_motion_crop", "ann_file": "ternium/test_motion_crop.json"},
+ "ternium_train_aug": {"img_dir": "ternium/train_crop_aug", "ann_file": "ternium/train_crop_aug.json"},
+ "ternium_test_aug": {"img_dir": "ternium/test_crop_aug", "ann_file": "ternium/test_motion_crop_aug.json"},
+ "ternium_vh_train": {
+ "img_dir": "ternium-vehicle/train_dataset_coco/images",
+ "ann_file": "ternium-vehicle/train_dataset_coco/coco_annotation.json",
+ },
+ "ternium_vh_val": {
+ "img_dir": "ternium-vehicle/validation_dataset_coco/images",
+ "ann_file": "ternium-vehicle/validation_dataset_coco/coco_annotation.json",
+ },
+ "msra_traffic": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/annotation.json"},
+ "msra_traffic_car": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/car_annotation.json"},
+ "msra_traffic_humancar": {
+ "img_dir": "msra-traffic/Images",
+ "ann_file": "msra-traffic/humancar_annotation.json",
+ },
+ "jigsaw_car_train": {"img_dir": "jigsaw", "ann_file": "jigsaw/train.json"},
+ "jigsaw_car_val": {"img_dir": "jigsaw", "ann_file": "jigsaw/val.json"},
+ "miotcd_train": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/train.json"},
+ "miotcd_val": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/val.json"},
+ "detrac_train": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/train.json"},
+ "detrac_val": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/val.json"},
+ "mrw": {"img_dir": "mrw/clips", "ann_file": "mrw/annotations.json"},
+ "mrw_bg": {"img_dir": "mrw/bg", "ann_file": "mrw/bg_annotations.json"},
+ "webmarket_bg": {"img_dir": "webmarket", "ann_file": "webmarket/bg_annotations.json"},
+ "mot17_train": {"img_dir": "mot/MOT17Det", "ann_file": "mot/MOT17Det/train.json"},
+ "egohands": {"img_dir": "egohands/images", "ann_file": "egohands/egohands.json"},
+ "hof": {"img_dir": "hof/images_original_size", "ann_file": "hof/train.json"},
+ "vlmhof": {"img_dir": "vlmhof/RGB", "ann_file": "vlmhof/train.json"},
+ "vgghands_train": {"img_dir": "vgghands/training_dataset", "ann_file": "vgghands/training.json"},
+ "vgghands_val": {"img_dir": "vgghands/validation_dataset", "ann_file": "vgghands/validation.json"},
+ "vgghands_test": {"img_dir": "vgghands/test_dataset", "ann_file": "vgghands/test.json"},
+ "od:coco_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/od_train2017.json"},
+ "od:coco_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/od_val2017.json"},
+ "od:lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/od_train-lvis.json"},
+ "od:lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/od_val-lvis.json"},
+ "od:o365_train": {"img_dir": "object365/train", "ann_file": "object365/od_train.json"},
+ "od:o365_val": {"img_dir": "object365/val", "ann_file": "object365/od_val.json"},
+ "od:oi500_train": {
+ "img_dir": "openimages/train",
+ "ann_file": "openimages/od_train2019.json",
+ "paste_dir": "openimages/panoptic_train_challenge_2019",
+ "paste_file": "openimages/panoptic_train2019.json",
+ },
+ "od:oi500_val": {
+ "img_dir": "openimages/val",
+ "ann_file": "openimages/od_val2019.json",
+ "paste_dir": "openimages/panoptic_val_challenge_2019",
+ "paste_file": "openimages/panoptic_val2019.json",
+ },
+ "od:im200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/train.json"},
+ "od:im200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/val.json"},
+ "cv:animal661_train": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/train.json"},
+ "cv:animal661_test": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/test.json"},
+ "cv:seeingai_train": {"img_dir": "cvtasks/SeeingAI/train.tsv", "ann_file": "cvtasks/SeeingAI/train.json"},
+ "cv:seeingai_test": {"img_dir": "cvtasks/SeeingAI/test.tsv", "ann_file": "cvtasks/SeeingAI/test.json"},
+ "cv:office_train": {
+ "img_dir": "cvtasks/Ping-Office-Env/train.tsv",
+ "ann_file": "cvtasks/Ping-Office-Env/train.json",
+ },
+ "cv:office_test": {
+ "img_dir": "cvtasks/Ping-Office-Env/test.tsv",
+ "ann_file": "cvtasks/Ping-Office-Env/test.json",
+ },
+ "cv:logo_train": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/train.json"},
+ "cv:logo_test": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/test.json"},
+ "cv:nba_train": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/train.json"},
+ "cv:nba_test": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/test.json"},
+ "cv:traffic_train": {"img_dir": "cvtasks/TrafficData/train.tsv", "ann_file": "cvtasks/TrafficData/train.json"},
+ "cv:traffic_test": {"img_dir": "cvtasks/TrafficData/test.tsv", "ann_file": "cvtasks/TrafficData/test.json"},
+ "cv:fashion5k_train": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/train.json"},
+ "cv:fashion5k_test": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/test.json"},
+ "cv:malaria_train": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/train.json"},
+ "cv:malaria_test": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/test.json"},
+ "cv:product_train": {
+ "img_dir": "cvtasks/product_detection",
+ "ann_file": "cvtasks/product_detection/train.json",
+ },
+ "cv:product_test": {"img_dir": "cvtasks/product_detection", "ann_file": "cvtasks/product_detection/test.json"},
+ "vl:vg_train": {"yaml_file": "vlp/visualgenome/train_vgoi6_clipped.yaml"},
+ "vl:vg_test": {"yaml_file": "vlp/visualgenome/test_vgoi6_clipped.yaml"},
+ "imagenet_train": {"img_dir": "imagenet-tsv/train.tsv", "ann_file": None},
+ "imagenet_val": {"img_dir": "imagenet-tsv/val.tsv", "ann_file": None},
+
+ "paco_lvis_v1_train_grounding":{
+ "img_dir": "coco",
+ "ann_file": "paco/paco_lvis_v1_train.json"
+ },
+
+ "paco_lvis_v1_val":{
+ "img_dir": "coco",
+ "ann_file": "paco/paco_lvis_v1_val.json"
+ },
+ "paco_lvis_v1_test":
+ {
+ "img_dir": "coco",
+ "ann_file": "paco/paco_lvis_v1_test.json"
+ },
+ "omnilabel_val": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3.json"},
+ "omnilabel_val_coco": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_coco.json"},
+ "omnilabel_val_o365": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_object365.json"},
+ "omnilabel_val_oi_v5": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_openimagesv5.json"},
+ "omnilabel_test": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_test_v0.1.3.json"},
+ }
+
+ @staticmethod
+ def set(name, info):
+ DatasetCatalog.DATASETS.update({name: info})
+
    @staticmethod
    def get(name):
        """Resolve a registered dataset ``name`` into a build spec.

        Returns a dict with two keys:
          - ``factory``: name of the dataset class to instantiate.
          - ``args``: keyword arguments for that class, with all paths
            resolved against the data root via ``try_to_find``.

        Dispatch is by substring matching on ``name``; a ``KeyError`` is
        raised if ``name`` is not registered (except the ``bing`` special
        case, which always maps to ``bing_caption_train``).
        """

        if name.endswith("_bg"):
            # Background-image datasets: both image dir and annotation file
            # are resolved relative to the same discovered data dir.
            attrs = DatasetCatalog.DATASETS[name]
            data_dir = try_to_find(attrs["ann_file"], return_dir=True)
            args = dict(
                root=os.path.join(data_dir, attrs["img_dir"]),
                ann_file=os.path.join(data_dir, attrs["ann_file"]),
            )
            return dict(
                factory="Background",
                args=args,
            )
        else:
            if "bing" in name.split("_"):
                # Every bing_* variant shares the single catalog entry.
                attrs = DatasetCatalog.DATASETS["bing_caption_train"]
            else:
                attrs = DatasetCatalog.DATASETS[name]
            # if "yaml_file" in attrs:
            #     yaml_file = try_to_find(attrs["yaml_file"], return_dir=False)
            #     args = dict(yaml_file=yaml_file)
            #     return dict(
            #         factory="VGTSVDataset",
            #         args=args,
            #     )
            # elif attrs["img_dir"].endswith('tsv'):
            #     try:
            #         data_dir = try_to_find(attrs["img_dir"], return_dir=True)
            #         if attrs["ann_file"] is None:
            #             map_file = None
            #         elif attrs["ann_file"].startswith("./"):
            #             map_file = attrs["ann_file"]
            #         else:
            #             map_file = os.path.join(data_dir, attrs["ann_file"])
            #     except:
            #         return None
            #     args = dict(
            #         tsv_file=os.path.join(data_dir, attrs["img_dir"]),
            #         anno_file=map_file,
            #     )
            #     return dict(
            #         factory="TSVDataset",
            #         args=args,
            #     )
            if "voc" in name and "split" in attrs:
                data_dir = try_to_find(attrs["data_dir"], return_dir=True)
                args = dict(
                    data_dir=os.path.join(data_dir, attrs["data_dir"]),
                    split=attrs["split"],
                )
                return dict(
                    factory="PascalVOCDataset",
                    args=args,
                )
            elif "omnilabel" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="OmniLabelDataset",
                    args=args,
                )
            elif "mixed" in name:
                # Mixed grounding data draws images from both VG and COCO.
                vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
                coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
                ann_file = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
                    img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
                    ann_file=os.path.join(ann_file, attrs["ann_file"]),
                )
                return dict(
                    factory="MixedDataset",
                    args=args,
                )
            elif "flickr" in name:
                img_dir = try_to_find(attrs["img_folder"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_folder"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                    is_train=attrs["is_train"],
                )
                return dict(
                    factory="FlickrDataset",
                    args=args,
                )
            elif "refexp" in name or "refcoco" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="RefExpDataset",
                    args=args,
                )
            elif "gqa" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="GQADataset",
                    args=args,
                )
            elif "phrasecut" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="PhrasecutDetection",
                    args=args,
                )
            elif "_caption" in name:
                # Caption TSV shards: the split id is taken from the third
                # underscore-separated token of ``name``
                # (e.g. "bing_caption_train_0" -> "0").
                yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
                if "no_coco" in name:
                    yaml_name = attrs["yaml_name_no_coco"]
                else:
                    yaml_name = attrs["yaml_name"]
                yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
                args = dict(yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name))
                return dict(
                    factory="CaptionTSV",
                    args=args,
                )
            elif "inferencecap" in name:
                yaml_file_name = try_to_find(attrs["yaml_path"])
                args = dict(yaml_file=yaml_file_name)
                return dict(
                    factory="CaptionTSV",
                    args=args,
                )
            elif "pseudo_data" in name:
                args = dict(yaml_file=try_to_find(attrs["yaml_path"]))
                return dict(
                    factory="PseudoData",
                    args=args,
                )
            elif "_dt" in name:
                dataset_file = attrs["dataset_file"]
                yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
                args = dict(
                    name=dataset_file,
                    yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
                )
                return dict(
                    factory="CocoDetectionTSV",
                    args=args,
                )
            elif "_odtsv" in name:
                dataset_file = attrs["dataset_file"]
                yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
                args = dict(
                    name=dataset_file,
                    yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
                )
                return dict(
                    factory="ODTSVDataset",
                    args=args,
                )
            elif "_grounding" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="CocoGrounding",
                    args=args,
                )
            elif "lvis_evaluation" in name:
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="LvisDetection",
                    args=args,
                )
            elif "paco" in name:
                # NOTE: "paco_lvis_v1_train_grounding" is caught by the
                # "_grounding" branch above, so this handles val/test only.
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                args = dict(
                    img_folder=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                return dict(
                    factory="PacoDetection",
                    args=args,
                )
            else:
                # Default: plain COCO-style dataset.
                ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
                img_dir = try_to_find(attrs["img_dir"], return_dir=True)
                args = dict(
                    root=os.path.join(img_dir, attrs["img_dir"]),
                    ann_file=os.path.join(ann_dir, attrs["ann_file"]),
                )
                # NOTE(review): this loop also forwards every catalog attr
                # (including "img_dir") joined against ann_dir — that joins
                # img_dir with the *annotation* root and passes extra kwargs
                # to COCODataset; looks suspicious — confirm against the
                # COCODataset constructor before relying on it.
                for k, v in attrs.items():
                    args.update({k: os.path.join(ann_dir, v)})
                return dict(
                    factory="COCODataset",
                    args=args,
                )

        # NOTE(review): unreachable — every branch above returns (or raises
        # KeyError during the catalog lookup).
        raise RuntimeError("Dataset not available: {}".format(name))
+
+
class ModelCatalog(object):
    """Static catalog mapping shorthand model identifiers to download URLs
    on the Detectron S3 bucket (ImageNet-pretrained backbones and the
    Caffe2 Detectron 12_2017 COCO baselines)."""

    S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
    C2_IMAGENET_MODELS = {
        "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
        "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
        "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
        "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
        "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
        "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
    }

    C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
    C2_DETECTRON_MODELS = {
        "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
        "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
        "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
        "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
        "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
        "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
        "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
        "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
    }

    @staticmethod
    def get(name):
        """Return the download URL for a catalog identifier.

        Raises RuntimeError for identifiers outside the two known families.
        """
        if name.startswith("Caffe2Detectron/COCO"):
            return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
        if name.startswith("ImageNetPretrained"):
            return ModelCatalog.get_c2_imagenet_pretrained(name)
        raise RuntimeError("model not present in the catalog {}".format(name))

    @staticmethod
    def get_c2_imagenet_pretrained(name):
        """URL for an ImageNet-pretrained backbone weight file."""
        # Strip the catalog namespace, then map to the bucket-relative path.
        relative = name[len("ImageNetPretrained/") :]
        bucket_path = ModelCatalog.C2_IMAGENET_MODELS[relative]
        return "/".join([ModelCatalog.S3_C2_DETECTRON_URL, bucket_path])

    @staticmethod
    def get_c2_detectron_12_2017_baselines(name):
        """URL for a Caffe2 Detectron 12_2017 COCO baseline checkpoint.

        Bucket layout:
        <prefix>/<model_id>/12_2017_baselines/<model_name>.yaml.<signature>/<suffix>
        where the catalog identifier is Caffe2Detectron/COCO/<model_id>/<model_name>.
        """
        key = name[len("Caffe2Detectron/COCO/") :]
        model_id, model_name = key.split("/")
        signature = ModelCatalog.C2_DETECTRON_MODELS[key]
        unique_name = ".".join(["{}.yaml".format(model_name), signature])
        return "/".join(
            [
                ModelCatalog.S3_C2_DETECTRON_URL,
                model_id,
                "12_2017_baselines",
                unique_name,
                ModelCatalog.C2_DETECTRON_SUFFIX,
            ]
        )
diff --git a/maskrcnn_benchmark/csrc/ROIAlign.h b/maskrcnn_benchmark/csrc/ROIAlign.h
new file mode 100644
index 0000000000000000000000000000000000000000..2683dbf52e120eebb7b60bb2257cd3527c5a86c3
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/ROIAlign.h
@@ -0,0 +1,46 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+// Interface for Python
// ROIAlign forward pass. Dispatches to the CUDA kernel when the input is a
// GPU tensor (and the extension was built WITH_CUDA), otherwise falls back
// to the CPU implementation.
at::Tensor ROIAlign_forward(const at::Tensor& input,
                            const at::Tensor& rois,
                            const float spatial_scale,
                            const int pooled_height,
                            const int pooled_width,
                            const int sampling_ratio) {
  if (input.device().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
}
+
// ROIAlign backward pass. CUDA-only: the CPU gradient is not implemented,
// so calling this with CPU tensors is a hard error.
at::Tensor ROIAlign_backward(const at::Tensor& grad,
                             const at::Tensor& rois,
                             const float spatial_scale,
                             const int pooled_height,
                             const int pooled_width,
                             const int batch_size,
                             const int channels,
                             const int height,
                             const int width,
                             const int sampling_ratio) {
  if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
    return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
+
diff --git a/maskrcnn_benchmark/csrc/ROIPool.h b/maskrcnn_benchmark/csrc/ROIPool.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b62b2dcb8f69ac65bc1fdf0eeb5fa556539bc13
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/ROIPool.h
@@ -0,0 +1,48 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+
+std::tuple ROIPool_forward(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width) {
+ if (input.device().is_cuda()) {
+#ifdef WITH_CUDA
+ return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
+#else
+ AT_ERROR("Not compiled with GPU support");
+#endif
+ }
+ AT_ERROR("Not implemented on the CPU");
+}
+
// ROIPool backward pass: routes gradients through the argmax locations
// recorded by the forward pass. CUDA-only; errors out on CPU tensors.
at::Tensor ROIPool_backward(const at::Tensor& grad,
                            const at::Tensor& input,
                            const at::Tensor& rois,
                            const at::Tensor& argmax,
                            const float spatial_scale,
                            const int pooled_height,
                            const int pooled_width,
                            const int batch_size,
                            const int channels,
                            const int height,
                            const int width) {
  if (grad.device().is_cuda()) {
#ifdef WITH_CUDA
    return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
+
+
+
diff --git a/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
new file mode 100644
index 0000000000000000000000000000000000000000..e220c12ae558a176f6b4b0a6640e724358f2ecb0
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/SigmoidFocalLoss.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "cpu/vision.h"
+
+#ifdef WITH_CUDA
+#include "cuda/vision.h"
+#endif
+
+// Interface for Python
// Sigmoid focal loss forward pass (focal loss of RetinaNet, with focusing
// parameter `gamma` and balancing parameter `alpha`). CUDA-only.
at::Tensor SigmoidFocalLoss_forward(
    const at::Tensor& logits,
    const at::Tensor& targets,
    const int num_classes,
    const float gamma,
    const float alpha) {
  if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
    return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
+
// Sigmoid focal loss backward pass: gradient of the loss w.r.t. `logits`,
// scaled by the incoming `d_losses`. CUDA-only.
at::Tensor SigmoidFocalLoss_backward(
    const at::Tensor& logits,
    const at::Tensor& targets,
    const at::Tensor& d_losses,
    const int num_classes,
    const float gamma,
    const float alpha) {
  if (logits.device().is_cuda()) {
#ifdef WITH_CUDA
    return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }
  AT_ERROR("Not implemented on the CPU");
}
diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c061351588df7752293ed84bba1c900768e3ab8
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp
@@ -0,0 +1,257 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+// implementation taken from Caffe2
// implementation taken from Caffe2
// Precomputed bilinear-interpolation data for one sampling point: the four
// neighbouring flattened pixel offsets (pos1..pos4) and their weights
// (w1..w4). Template parameter restored (stripped by markup mangling).
template <typename T>
struct PreCalc {
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  T w1;
  T w2;
  T w3;
  T w4;
};
+
+template
+void pre_calc_for_bilinear_interpolate(
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int iy_upper,
+ const int ix_upper,
+ T roi_start_h,
+ T roi_start_w,
+ T bin_size_h,
+ T bin_size_w,
+ int roi_bin_grid_h,
+ int roi_bin_grid_w,
+ std::vector>& pre_calc) {
+ int pre_calc_index = 0;
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ for (int iy = 0; iy < iy_upper; iy++) {
+ const T yy = roi_start_h + ph * bin_size_h +
+ static_cast(iy + .5f) * bin_size_h /
+ static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast(ix + .5f) * bin_size_w /
+ static_cast(roi_bin_grid_w);
+
+ T x = xx;
+ T y = yy;
+ // deal with: inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ PreCalc pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y <= 0) {
+ y = 0;
+ }
+ if (x <= 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ // save weights and indeces
+ PreCalc pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+}
+
+template
+void ROIAlignForward_cpu_kernel(
+ const int nthreads,
+ const T* bottom_data,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois,
+ //int roi_cols,
+ T* top_data) {
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
+ int roi_cols = 5;
+
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ // roi could have 4 or 5 columns
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+ int roi_batch_ind = 0;
+ if (roi_cols == 5) {
+ roi_batch_ind = offset_bottom_rois[0];
+ offset_bottom_rois++;
+ }
+
+ // Do not using rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
+ T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ // we want to precalculate indeces and weights shared by all chanels,
+ // this is the key point of optimiation
+ std::vector> pre_calc(
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ PreCalc pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+ pc.w2 * offset_bottom_data[pc.pos2] +
+ pc.w3 * offset_bottom_data[pc.pos3] +
+ pc.w4 * offset_bottom_data[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+}
+
+at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
+ AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+ ROIAlignForward_cpu_kernel(
+ output_size,
+ input.data_ptr(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois.data_ptr(),
+ output.data_ptr());
+ });
+ return output;
+}
diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11b7aa60fdca907352b334f142faadb46d662f99
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp
@@ -0,0 +1,75 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+
+template
+at::Tensor nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+ auto ndets = dets.size(0);
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
+
+ auto suppressed = suppressed_t.data_ptr();
+ auto order = order_t.data_ptr();
+ auto x1 = x1_t.data_ptr();
+ auto y1 = y1_t.data_ptr();
+ auto x2 = x2_t.data_ptr();
+ auto y2 = y2_t.data_ptr();
+ auto areas = areas_t.data_ptr();
+
+ for (int64_t _i = 0; _i < ndets; _i++) {
+ auto i = order[_i];
+ if (suppressed[i] == 1)
+ continue;
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto iarea = areas[i];
+
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
+ auto j = order[_j];
+ if (suppressed[j] == 1)
+ continue;
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast(0), yy2 - yy1 + 1);
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+ if (ovr >= threshold)
+ suppressed[j] = 1;
+ }
+ }
+ return at::nonzero(suppressed_t == 0).squeeze(1);
+}
+
+at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ at::Tensor result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
+ result = nms_cpu_kernel(dets, scores, threshold);
+ });
+ return result;
+}
diff --git a/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp b/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..423941d71e29f5b9823006d57cdf0088646586ed
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/soft_nms.cpp
@@ -0,0 +1,117 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include "cpu/vision.h"
+
+
+template
+std::pair soft_nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
+ at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ auto scores_t = scores.clone();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+ auto ndets = dets.size(0);
+ auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
+
+ auto x1 = x1_t.data_ptr();
+ auto y1 = y1_t.data_ptr();
+ auto x2 = x2_t.data_ptr();
+ auto y2 = y2_t.data_ptr();
+ auto s = scores_t.data_ptr();
+ auto inds = inds_t.data_ptr();
+ auto areas = areas_t.data_ptr();
+
+ for (int64_t i = 0; i < ndets; i++) {
+
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto is = s[i];
+ auto ii = inds[i];
+ auto iarea = areas[i];
+
+ auto maxpos = scores_t.slice(0, i, ndets).argmax().item() + i;
+
+ // add max box as a detection
+ x1[i] = x1[maxpos];
+ y1[i] = y1[maxpos];
+ x2[i] = x2[maxpos];
+ y2[i] = y2[maxpos];
+ s[i] = s[maxpos];
+ inds[i] = inds[maxpos];
+ areas[i] = areas[maxpos];
+
+ // swap ith box with position of max box
+ x1[maxpos] = ix1;
+ y1[maxpos] = iy1;
+ x2[maxpos] = ix2;
+ y2[maxpos] = iy2;
+ s[maxpos] = is;
+ inds[maxpos] = ii;
+ areas[maxpos] = iarea;
+
+ ix1 = x1[i];
+ iy1 = y1[i];
+ ix2 = x2[i];
+ iy2 = y2[i];
+ iarea = areas[i];
+
+ // NMS iterations, note that ndets changes if detection boxes
+ // fall below threshold
+ for (int64_t j = i + 1; j < ndets; j++) {
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast(0), yy2 - yy1 + 1);
+
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+
+ s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
+
+ // if box score falls below threshold, discard the box by
+ // swapping with last box update ndets
+ if (s[j] < threshold) {
+ x1[j] = x1[ndets - 1];
+ y1[j] = y1[ndets - 1];
+ x2[j] = x2[ndets - 1];
+ y2[j] = y2[ndets - 1];
+ s[j] = s[ndets - 1];
+ inds[j] = inds[ndets - 1];
+ areas[j] = areas[ndets - 1];
+ j--;
+ ndets--;
+ }
+ }
+ }
+ return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
+}
+
+std::pair soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ std::pair result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
+ result = soft_nms_cpu_kernel(dets, scores, threshold, sigma);
+ });
+ return result;
+}
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cpu/vision.h b/maskrcnn_benchmark/csrc/cpu/vision.h
new file mode 100644
index 0000000000000000000000000000000000000000..e00ef683150eb9d46d0e4f6a30f55a7230a52e93
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cpu/vision.h
@@ -0,0 +1,22 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
#include <torch/extension.h>
+
+
+at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+
+at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold);
+
+
std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma);
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9ed1a0adfd841a17d3574dee6ac703820fcfe144
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu
@@ -0,0 +1,346 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+template <typename T>
+__device__ T bilinear_interpolate(const T* bottom_data,
+    const int height, const int width,
+    T y, T x,
+    const int index /* index for debug only*/) {
+
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    //empty
+    return 0;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  int y_low = (int) y;
+  int x_low = (int) x;
+  int y_high;
+  int x_high;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T) y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T) x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+  // do bilinear interpolation
+  T v1 = bottom_data[y_low * width + x_low];
+  T v2 = bottom_data[y_low * width + x_high];
+  T v3 = bottom_data[y_high * width + x_low];
+  T v4 = bottom_data[y_high * width + x_high];
+  T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  return val;
+}
+
+template <typename T>
+__global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
+    const T spatial_scale, const int channels,
+    const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const int sampling_ratio,
+    const T* bottom_rois, T* top_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5; // each roi is (batch_idx, x1, y1, x2, y2)
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    T output_val = 0.;
+    for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+      {
+        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+        T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
+        output_val += val;
+      }
+    }
+    output_val /= count;
+
+    top_data[index] = output_val;
+  }
+}
+
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+    const int height, const int width,
+    T y, T x,
+    T & w1, T & w2, T & w3, T & w4,
+    int & x_low, int & x_high, int & y_low, int & y_high,
+    const int index /* index for debug only*/) {
+
+  // deal with cases that inverse elements are out of feature map boundary
+  if (y < -1.0 || y > height || x < -1.0 || x > width) {
+    //empty
+    w1 = w2 = w3 = w4 = 0.;
+    x_low = x_high = y_low = y_high = -1;
+    return;
+  }
+
+  if (y <= 0) y = 0;
+  if (x <= 0) x = 0;
+
+  y_low = (int) y;
+  x_low = (int) x;
+
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T) y_low;
+  } else {
+    y_high = y_low + 1;
+  }
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T) x_low;
+  } else {
+    x_high = x_low + 1;
+  }
+
+  T ly = y - y_low;
+  T lx = x - x_low;
+  T hy = 1. - ly, hx = 1. - lx;
+
+  // reference in forward
+  // T v1 = bottom_data[y_low * width + x_low];
+  // T v2 = bottom_data[y_low * width + x_high];
+  // T v3 = bottom_data[y_high * width + x_low];
+  // T v4 = bottom_data[y_high * width + x_high];
+  // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+  w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+  return;
+}
+
+template <typename T>
+__global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
+    const int num_rois, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const int sampling_ratio,
+    T* bottom_diff,
+    const T* bottom_rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5; // each roi is (batch_idx, x1, y1, x2, y2)
+    int roi_batch_ind = offset_bottom_rois[0];
+
+    // Do not using rounding; this implementation detail is critical
+    T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+    T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+    T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+    T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+    // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+    T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+    // We use roi_bin_grid to sample the grid and mimic integral
+    int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+    int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+    // We do average (integral) pooling inside a bin
+    const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+    for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+    {
+      const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+      for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+      {
+        const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+        T w1, w2, w3, w4;
+        int x_low, x_high, y_low, y_high;
+
+        bilinear_interpolate_gradient(height, width, y, x,
+            w1, w2, w3, w4,
+            x_low, x_high, y_low, y_high,
+            index);
+
+        T g1 = top_diff_this_bin * w1 / count;
+        T g2 = top_diff_this_bin * w2 / count;
+        T g3 = top_diff_this_bin * w3 / count;
+        T g4 = top_diff_this_bin * w4 / count;
+
+        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
+        {
+          atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+          atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+          atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+          atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+        } // if
+      } // ix
+    } // iy
+  } // CUDA_1D_KERNEL_LOOP
+} // RoIAlignBackward
+
+
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
+                                 const at::Tensor& rois,
+                                 const float spatial_scale,
+                                 const int pooled_height,
+                                 const int pooled_width,
+                                 const int sampling_ratio) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return output;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+    RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+        output_size,
+        input.contiguous().data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        rois.contiguous().data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>());
+  });
+  THCudaCheck(cudaGetLastError());
+  return output;
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
+                                  const at::Tensor& rois,
+                                  const float spatial_scale,
+                                  const int pooled_height,
+                                  const int pooled_width,
+                                  const int batch_size,
+                                  const int channels,
+                                  const int height,
+                                  const int width,
+                                  const int sampling_ratio) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  auto num_rois = rois.size(0);
+  auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return grad_input;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+    RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad.contiguous().data_ptr<scalar_t>(),
+        num_rois,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        sampling_ratio,
+        grad_input.data_ptr<scalar_t>(),
+        rois.contiguous().data_ptr<scalar_t>());
+  });
+  THCudaCheck(cudaGetLastError());
+  return grad_input;
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..60fc9fbc55956304c7ff6b48cbf3c086029b8354
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu
@@ -0,0 +1,202 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+template <typename T>
+__global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
+    const T spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    const T* bottom_rois, T* top_data, int* argmax_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5; // each roi is (batch_idx, x1, y1, x2, y2)
+    int roi_batch_ind = offset_bottom_rois[0];
+    int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+    // Force malformed ROIs to be 1x1
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height)
+                   / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width)
+                   / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph)
+                                        * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw)
+                                        * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
+                                     * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
+                                     * bin_size_w));
+
+    // Add roi offsets and clip to input boundaries
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    // Define an empty pooling region to be zero
+    T maxval = is_empty ? 0 : -FLT_MAX;
+    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+    int maxidx = -1;
+    const T* offset_bottom_data =
+        bottom_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int bottom_index = h * width + w;
+        if (offset_bottom_data[bottom_index] > maxval) {
+          maxval = offset_bottom_data[bottom_index];
+          maxidx = bottom_index;
+        }
+      }
+    }
+    top_data[index] = maxval;
+    argmax_data[index] = maxidx;
+  }
+}
+
+template <typename T>
+__global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
+    const int* argmax_data, const int num_rois, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* bottom_diff,
+    const T* bottom_rois) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const T* offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    int bottom_offset = (roi_batch_ind * channels + c) * height * width;
+    int top_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_top_diff = top_diff + top_offset;
+    T* offset_bottom_diff = bottom_diff + bottom_offset;
+    const int* offset_argmax_data = argmax_data + top_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {
+      atomicAdd(
+          offset_bottom_diff + argmax,
+          static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
+
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
+                                                        const at::Tensor& rois,
+                                                        const float spatial_scale,
+                                                        const int pooled_height,
+                                                        const int pooled_width) {
+  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+  auto num_rois = rois.size(0);
+  auto channels = input.size(1);
+  auto height = input.size(2);
+  auto width = input.size(3);
+
+  auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+  auto output_size = num_rois * pooled_height * pooled_width * channels;
+  auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+  dim3 block(512);
+
+  if (output.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return std::make_tuple(output, argmax);
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
+    RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
+        output_size,
+        input.contiguous().data_ptr<scalar_t>(),
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        rois.contiguous().data_ptr<scalar_t>(),
+        output.data_ptr<scalar_t>(),
+        argmax.data_ptr<int>());
+  });
+  THCudaCheck(cudaGetLastError());
+  return std::make_tuple(output, argmax);
+}
+
+// TODO remove the dependency on input and use instead its sizes -> save memory
+at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
+                                 const at::Tensor& input,
+                                 const at::Tensor& rois,
+                                 const at::Tensor& argmax,
+                                 const float spatial_scale,
+                                 const int pooled_height,
+                                 const int pooled_width,
+                                 const int batch_size,
+                                 const int channels,
+                                 const int height,
+                                 const int width) {
+  AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+  AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+  // TODO add more checks
+
+  auto num_rois = rois.size(0);
+  auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+  dim3 block(512);
+
+  // handle possibly empty gradients
+  if (grad.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return grad_input;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
+    RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
+        grad.numel(),
+        grad.contiguous().data_ptr<scalar_t>(),
+        argmax.data_ptr<int>(),
+        num_rois,
+        spatial_scale,
+        channels,
+        height,
+        width,
+        pooled_height,
+        pooled_width,
+        grad_input.data_ptr<scalar_t>(),
+        rois.contiguous().data_ptr<scalar_t>());
+  });
+  THCudaCheck(cudaGetLastError());
+  return grad_input;
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aeceae0f825598cd36ea99add8da613c5e2482a
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu
@@ -0,0 +1,188 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
+// Cheng-Yang Fu
+// cyfu@cs.unc.edu
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCAtomics.cuh>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <cfloat>
+
+// TODO make it in a common file
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+template <typename T>
+__global__ void SigmoidFocalLossForward(const int nthreads,
+    const T* logits,
+    const int* targets,
+    const int num_classes,
+    const float gamma,
+    const float alpha,
+    const int num,
+    T* losses) {
+  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+    int n = i / num_classes;
+    int d = i % num_classes; // current class[0~79];
+    int t = targets[n]; // target class [1~80];
+
+    // Decide it is positive or negative case.
+    T c1 = (t == (d+1));
+    T c2 = (t>=0 & t != (d+1)); // bitwise & on bools; relational ops bind tighter so this is (t>=0) & (t != d+1)
+
+    T zn = (1.0 - alpha);
+    T zp = (alpha);
+
+    // p = 1. / 1. + expf(-x); p = sigmoid(x)
+    T p = 1. / (1. + expf(-logits[i]));
+
+    // (1-p)**gamma * log(p) where
+    T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
+
+    // p**gamma * log(1-p)
+    T term2 = powf(p, gamma) *
+        (-1. * logits[i] * (logits[i] >= 0) -
+         logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
+
+    losses[i] = 0.0;
+    losses[i] += -c1 * term1 * zp;
+    losses[i] += -c2 * term2 * zn;
+
+  } // CUDA_1D_KERNEL_LOOP
+} // SigmoidFocalLossForward
+
+
+template <typename T>
+__global__ void SigmoidFocalLossBackward(const int nthreads,
+    const T* logits,
+    const int* targets,
+    const T* d_losses,
+    const int num_classes,
+    const float gamma,
+    const float alpha,
+    const int num,
+    T* d_logits) {
+  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+    int n = i / num_classes;
+    int d = i % num_classes; // current class[0~79];
+    int t = targets[n]; // target class [1~80], 0 is background;
+
+    // Decide it is positive or negative case.
+    T c1 = (t == (d+1));
+    T c2 = (t>=0 & t != (d+1)); // bitwise & on bools; relational ops bind tighter so this is (t>=0) & (t != d+1)
+
+    T zn = (1.0 - alpha);
+    T zp = (alpha);
+    // p = 1. / 1. + expf(-x); p = sigmoid(x)
+    T p = 1. / (1. + expf(-logits[i]));
+
+    // (1-p)**g * (1 - p - g*p*log(p)
+    T term1 = powf((1. - p), gamma) *
+        (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
+
+    // (p**g) * (g*(1-p)*log(1-p) - p)
+    T term2 = powf(p, gamma) *
+        ((-1. * logits[i] * (logits[i] >= 0) -
+          logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
+         (1. - p) * gamma - p);
+    d_logits[i] = 0.0;
+    d_logits[i] += -c1 * term1 * zp;
+    d_logits[i] += -c2 * term2 * zn;
+    d_logits[i] = d_logits[i] * d_losses[i];
+
+  } // CUDA_1D_KERNEL_LOOP
+} // SigmoidFocalLossBackward
+
+
+at::Tensor SigmoidFocalLoss_forward_cuda(
+    const at::Tensor& logits,
+    const at::Tensor& targets,
+    const int num_classes,
+    const float gamma,
+    const float alpha) {
+  AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+  AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+  AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+  const int num_samples = logits.size(0);
+
+  auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
+  auto losses_size = num_samples * logits.size(1);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
+  dim3 block(512);
+
+  if (losses.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return losses;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
+    SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
+        losses_size,
+        logits.contiguous().data_ptr<scalar_t>(),
+        targets.contiguous().data_ptr<int>(),
+        num_classes,
+        gamma,
+        alpha,
+        num_samples,
+        losses.data_ptr<scalar_t>());
+  });
+  THCudaCheck(cudaGetLastError());
+  return losses;
+}
+
+
+at::Tensor SigmoidFocalLoss_backward_cuda(
+    const at::Tensor& logits,
+    const at::Tensor& targets,
+    const at::Tensor& d_losses,
+    const int num_classes,
+    const float gamma,
+    const float alpha) {
+  AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+  AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+  AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
+
+  AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+  const int num_samples = logits.size(0);
+  AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
+
+  auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
+  auto d_logits_size = num_samples * logits.size(1);
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
+  dim3 block(512);
+
+  if (d_logits.numel() == 0) {
+    THCudaCheck(cudaGetLastError());
+    return d_logits;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
+    SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
+        d_logits_size,
+        logits.contiguous().data_ptr<scalar_t>(),
+        targets.contiguous().data_ptr<int>(),
+        d_losses.contiguous().data_ptr<scalar_t>(),
+        num_classes,
+        gamma,
+        alpha,
+        num_samples,
+        d_logits.data_ptr<scalar_t>());
+  });
+
+  THCudaCheck(cudaGetLastError());
+  return d_logits;
+}
+
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2cdf8d61957e50d452dd230c97b5754dacd2fa0e
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu
@@ -0,0 +1,691 @@
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+#include <THC/THC.h>
+#include <THC/THCDeviceUtils.cuh>
+
+#include <vector>
+#include <iostream>
+#include <cmath>
+
+
+void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
+ const int channels, const int height, const int width,
+ const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group,
+ at::Tensor data_col);
+
+void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
+ const int channels, const int height, const int width,
+ const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int parallel_imgs, const int deformable_group,
+ at::Tensor grad_im);
+
+void deformable_col2im_coord(
+ const at::Tensor data_col, const at::Tensor data_im,
+ const at::Tensor data_offset, const int channels, const int height,
+ const int width, const int ksize_h, const int ksize_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
+ const int deformable_group, at::Tensor grad_offset);
+
+void modulated_deformable_im2col_cuda(
+ const at::Tensor data_im, const at::Tensor data_offset,
+ const at::Tensor data_mask, const int batch_size, const int channels,
+ const int height_im, const int width_im, const int height_col,
+ const int width_col, const int kernel_h, const int kenerl_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int deformable_group,
+ at::Tensor data_col);
+
+void modulated_deformable_col2im_cuda(
+ const at::Tensor data_col, const at::Tensor data_offset,
+ const at::Tensor data_mask, const int batch_size, const int channels,
+ const int height_im, const int width_im, const int height_col,
+ const int width_col, const int kernel_h, const int kenerl_w,
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w, const int deformable_group,
+ at::Tensor grad_im);
+
+void modulated_deformable_col2im_coord_cuda(
+ const at::Tensor data_col, const at::Tensor data_im,
+ const at::Tensor data_offset, const at::Tensor data_mask,
+ const int batch_size, const int channels, const int height_im,
+ const int width_im, const int height_col, const int width_col,
+ const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w, const int dilation_h,
+ const int dilation_w, const int deformable_group, at::Tensor grad_offset,
+ at::Tensor grad_mask);
+
+void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
+                 at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
+                 int padW, int dilationH, int dilationW, int group,
+                 int deformable_group)
+{ // Validates deformable-conv tensor geometry; pass gradOutput == NULL to skip the backward-only checks.
+  TORCH_CHECK(weight.ndimension() == 4,
+           "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+           "but got: %s",
+           weight.ndimension()); // NOTE(review): TORCH_CHECK streams its extra args; the printf-style %s/%d here print literally — confirm intended
+
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  TORCH_CHECK(kW > 0 && kH > 0,
+           "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
+           kW);
+
+  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
+           "kernel size should be consistent with weight, ",
+           "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
+           kW, weight.size(2), weight.size(3));
+
+  TORCH_CHECK(dW > 0 && dH > 0,
+           "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+  TORCH_CHECK(
+      dilationW > 0 && dilationH > 0,
+      "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
+      dilationH, dilationW);
+
+  int ndim = input.ndimension();
+  int dimf = 0; // feature (channel) dim index; shifted by one below when input is batched 4-D
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
+           ndim);
+
+  long nInputPlane = weight.size(1) * group; // weight stores in-channels / group, so multiply back up
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  TORCH_CHECK(nInputPlane % deformable_group == 0,
+           "input channels must divide deformable group size");
+
+  if (outputWidth < 1 || outputHeight < 1)
+    AT_ERROR(
+        "Given input size: (%ld x %ld x %ld). "
+        "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+        nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
+        outputWidth);
+
+  TORCH_CHECK(input.size(1) == nInputPlane,
+           "invalid number of input planes, expected: %d, but got: %d",
+           nInputPlane, input.size(1));
+
+  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
+           "input image is smaller than kernel");
+
+  TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
+           "invalid spatial size of offset, expected height: %d width: %d, but "
+           "got height: %d width: %d",
+           outputHeight, outputWidth, offset.size(2), offset.size(3));
+
+  TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
+           "invalid number of channels of offset"); // 2 coords per kernel tap per deformable group
+
+  if (gradOutput != NULL) {
+    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
+             "invalid number of gradOutput planes, expected: %d, but got: %d",
+             nOutputPlane, gradOutput->size(dimf));
+
+    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
+              gradOutput->size(dimw) == outputWidth),
+             "invalid size of gradOutput, expected height: %d width: %d , but "
+             "got height: %d width: %d",
+             outputHeight, outputWidth, gradOutput->size(dimh),
+             gradOutput->size(dimw));
+  }
+}
+
+int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
+                             at::Tensor offset, at::Tensor output,
+                             at::Tensor columns, at::Tensor ones, int kW,
+                             int kH, int dW, int dH, int padW, int padH,
+                             int dilationW, int dilationH, int group,
+                             int deformable_group, int im2col_step)
+{ // Forward pass: deformable im2col + grouped GEMM, processing im2col_step images per GEMM batch.
+  // todo: resize columns to include im2col: done
+  // todo: add im2col_step as input
+  // todo: add new output buffer and transpose it to output (or directly
+  // transpose output) todo: possibly change data indexing because of
+  // parallel_imgs
+
+  shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
+              dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1; // remembers whether the caller passed a batched (4-D) input; 0 means a 3-D input was unsqueezed
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input.unsqueeze_(0);
+    offset.unsqueeze_(0);
+  }
+
+  // todo: assert batchsize dividable by im2col_step
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
+                        outputHeight, outputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options()); // im2col scratch buffer, reused for every GEMM batch
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
+    ones = at::ones({outputHeight, outputWidth}, input.options());
+  }
+
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  at::Tensor output_buffer =
+      at::zeros({batchSize / im2col_step, nOutputPlane,
+                 im2col_step * outputHeight, outputWidth},
+                output.options());
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), group, output_buffer.size(1) / group,
+       output_buffer.size(2), output_buffer.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) { // one iteration per im2col_step-sized slice of the batch
+    deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) { // grouped convolution: one GEMM per group
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight[g].flatten(1), columns[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+  }
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
+       output_buffer.size(3), output_buffer.size(4)});
+
+  output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
+                                      im2col_step, outputHeight, outputWidth});
+  output_buffer.transpose_(1, 2); // restore (batch-slice, image, plane, h, w) ordering before the copy
+  output.copy_(output_buffer);
+  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) { // caller gave a 3-D input: strip the batch dim we added above
+    output = output.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
+                                    at::Tensor gradOutput, at::Tensor gradInput,
+                                    at::Tensor gradOffset, at::Tensor weight,
+                                    at::Tensor columns, int kW, int kH, int dW,
+                                    int dH, int padW, int padH, int dilationW,
+                                    int dilationH, int group,
+                                    int deformable_group, int im2col_step)
+{
+  // Backward pass of deformable convolution w.r.t. the input feature map and
+  // the sampling offsets.  gradInput and gradOffset are written in place;
+  // the weight gradient is handled by deform_conv_backward_parameters_cuda.
+  // Returns 1 on success.
+  shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
+              dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force a batch dimension so the 4-D code path below applies uniformly.
+    batch = 0;
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // BUGFIX: a stray literal `3` used to sit between the condition and the
+  // message; TORCH_CHECK concatenates extra args into the error text, so the
+  // message read "3invalid batch size of offset".
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output: pack groups of im2col_step images together
+  // so a single im2col/col2im launch processes them in parallel
+  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                              inputHeight, inputWidth});
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
+                                deformable_group * 2 * kH * kW, outputHeight,
+                                outputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), group, gradOutput.size(1) / group,
+         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
+
+    for (int g = 0; g < group; g++) {
+      // columns = W^T * gradOutput  (beta = 0 discards previous contents)
+      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
+         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
+
+    // scatter the column gradients back onto the offsets ...
+    deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
+                            inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+                            dilationH, dilationW, im2col_step, deformable_group,
+                            gradOffset[elt]);
+
+    // ... and onto the input feature map
+    deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, gradInput[elt]);
+  }
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  gradOffset = gradOffset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    // Restore the original unbatched (3-D) shapes for the caller.
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+    gradOffset =
+        gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
+    at::Tensor gradWeight, // at::Tensor gradBias,
+    at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
+    int padW, int padH, int dilationW, int dilationH, int group,
+    int deformable_group, float scale, int im2col_step)
+{
+  // Backward pass of deformable convolution w.r.t. the filter weights.
+  // Accumulates scale * dL/dW into gradWeight in place; gradients w.r.t.
+  // input/offset are computed by deform_conv_backward_input_cuda.
+  // Returns 1 on success.
+  // todo: transpose and reshape outGrad
+  // todo: reshape columns
+  // todo: add im2col_step as input
+
+  shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
+              padW, dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view(
+        at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  // Materialize the transposed gradOutput contiguously so each
+  // (nOutputPlane, im2col_step*outputHeight*outputWidth) slice can be fed
+  // straight into addmm_ below.
+  at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+  gradOutputBuffer =
+      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
+                             outputHeight, outputWidth});
+  gradOutputBuffer.copy_(gradOutput);
+  gradOutputBuffer =
+      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
+                             im2col_step * outputHeight, outputWidth});
+
+  // Undo the transpose so gradOutput can be restored to its caller shape.
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // Unfold the deformably-sampled input patches into columns.
+    deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, columns);
+
+    // divide into group
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    gradWeight =
+        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+                         gradWeight.size(2), gradWeight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // gradWeight += scale * gradOutput * columns^T  (per group, in place)
+      gradWeight[g] = gradWeight[g]
+                          .flatten(1)
+                          .addmm_(gradOutputBuffer[elt][g].flatten(1),
+                                  columns[g].transpose(1, 0), 1.0, scale)
+                          .view_as(gradWeight[g]);
+    }
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                  gradWeight.size(2), gradWeight.size(3),
+                                  gradWeight.size(4)});
+  }
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    // Restore the original unbatched (3-D) shapes for the caller.
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+  }
+
+  return 1;
+}
+
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+    int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+    const int pad_h, const int pad_w, const int dilation_h,
+    const int dilation_w, const int group, const int deformable_group,
+    const bool with_bias)
+{
+  // Forward pass of modulated deformable convolution (DCNv2): each sampling
+  // location carries learned offsets plus a scalar modulation mask.  The
+  // result is written into `output` in place; `columns` and `ones` are
+  // scratch buffers reused across calls.
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  // BUGFIX: the first two message args were kernel_h_, kernel_w — repeating
+  // the actual kernel height instead of reporting expected (kernel_h,
+  // kernel_w) vs actual (kernel_h_, kernel_w_).
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+             kernel_h, kernel_w, kernel_h_, kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
+             channels, channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  // resize output
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns =
+      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+                input.options());
+
+  output = output.view({output.size(0), group, output.size(1) / group,
+                        output.size(2), output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // Unfold the modulated, deformably-sampled patches of image b.
+    modulated_deformable_im2col_cuda(
+        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, columns);
+
+    // divide into group
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      // output = W * columns  (per group, accumulated in place)
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+
+  output = output.view({output.size(0), output.size(1) * output.size(2),
+                        output.size(3), output.size(4)});
+
+  if (with_bias) {
+    // Broadcast the per-channel bias over batch and spatial dims.
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor columns,
+    at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+    at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+    const bool with_bias)
+{
+  // Backward pass of modulated deformable convolution (DCNv2).  Computes, in
+  // place, the gradients w.r.t. input, offset, mask, weight and (optionally)
+  // bias; `columns` and `ones` are scratch buffers.
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  // BUGFIX: the first two message args were kernel_h_, kernel_w — repeating
+  // the actual kernel height instead of reporting expected (kernel_h,
+  // kernel_w) vs actual (kernel_h_, kernel_w_).
+  if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+    AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).",
+             kernel_h, kernel_w, kernel_h_, kernel_w_);
+  if (channels != channels_kernel * group)
+    AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).",
+             channels, channels_kernel * group);
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+                      input.options());
+
+  grad_output =
+      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+                        grad_output.size(2), grad_output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // divide int group
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // columns = W^T * grad_output  (beta = 0 discards previous contents)
+      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cuda(
+        columns, input[b], offset[b], mask[b], 1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
+        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
+        grad_mask[b]);
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cuda(
+        columns, offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, grad_input[b]);
+
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and
+    // group
+    modulated_deformable_im2col_cuda(
+        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+                                    grad_weight.size(1), grad_weight.size(2),
+                                    grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      // grad_weight += grad_output * columns^T  (per group, in place)
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        // grad_bias += row sums of grad_output (matmul with a ones vector)
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                    grad_weight.size(2), grad_weight.size(3),
+                                    grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                  grad_output.size(2), grad_output.size(3),
+                                  grad_output.size(4)});
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee15810103a4edaf213abdb222a70249d622c0f9
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu
@@ -0,0 +1,874 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+
+using namespace at;
+
+// Grid-stride loop: each thread processes element i, then strides forward by
+// the total number of launched threads until all n elements are covered.
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+const int CUDA_NUM_THREADS = 1024;
+const int kMaxGridNum = 65535;
+
+// Number of thread blocks needed to cover N elements, capped at the
+// maximum 1-D grid size (the grid-stride loop picks up the remainder).
+inline int GET_BLOCKS(const int N)
+{
+  return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS);
+}
+
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
+                                               const int height, const int width, scalar_t h, scalar_t w)
+{
+  // Bilinearly samples a (height x width) image plane (row stride data_width)
+  // at the fractional location (h, w); neighbours outside the plane
+  // contribute zero.  NOTE: the stripped `<typename scalar_t>` template
+  // header has been restored — the code did not compile without it.
+  int h_low = floor(h);
+  int w_low = floor(w);
+  int h_high = h_low + 1;
+  int w_high = w_low + 1;
+
+  scalar_t lh = h - h_low;
+  scalar_t lw = w - w_low;
+  scalar_t hh = 1 - lh, hw = 1 - lw;
+
+  // Fetch the four neighbouring pixels, zero-padding any out-of-range one.
+  scalar_t v1 = 0;
+  if (h_low >= 0 && w_low >= 0)
+    v1 = bottom_data[h_low * data_width + w_low];
+  scalar_t v2 = 0;
+  if (h_low >= 0 && w_high <= width - 1)
+    v2 = bottom_data[h_low * data_width + w_high];
+  scalar_t v3 = 0;
+  if (h_high <= height - 1 && w_low >= 0)
+    v3 = bottom_data[h_high * data_width + w_low];
+  scalar_t v4 = 0;
+  if (h_high <= height - 1 && w_high <= width - 1)
+    v4 = bottom_data[h_high * data_width + w_high];
+
+  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+  return val;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                        const int h, const int w, const int height, const int width)
+{
+  // Bilinear weight that integer pixel (h, w) contributes to the fractional
+  // sampling location (argmax_h, argmax_w); 0 unless (h, w) is one of the
+  // four neighbours.  Restored the stripped `<typename scalar_t>` header.
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    // sampling point entirely outside the image
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+  if (h == argmax_h_low && w == argmax_w_low)
+    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+  if (h == argmax_h_low && w == argmax_w_high)
+    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+  if (h == argmax_h_high && w == argmax_w_low)
+    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+  if (h == argmax_h_high && w == argmax_w_high)
+    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+  return weight;
+}
+
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                          const int height, const int width, const scalar_t *im_data,
+                                          const int data_width, const int bp_dir)
+{
+  // Derivative of the bilinear sample at (argmax_h, argmax_w) w.r.t. the
+  // sampling coordinate itself: bp_dir == 0 gives d/dh, bp_dir == 1 gives
+  // d/dw.  Restored the stripped `<typename scalar_t>` template header.
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    // sampling point entirely outside the image
+    return 0;
+  }
+
+  int argmax_h_low = floor(argmax_h);
+  int argmax_w_low = floor(argmax_w);
+  int argmax_h_high = argmax_h_low + 1;
+  int argmax_w_high = argmax_w_low + 1;
+
+  scalar_t weight = 0;
+
+  if (bp_dir == 0)
+  {
+    // d(sample)/d(argmax_h): top-row pixels weighted -(1 - frac_w)/-(frac_w),
+    // bottom-row pixels weighted +(1 - frac_w)/+(frac_w)
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+  else if (bp_dir == 1)
+  {
+    // d(sample)/d(argmax_w): symmetric to the bp_dir == 0 case.
+    if (argmax_h_low >= 0 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
+    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+      weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
+    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+      weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
+    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+      weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
+  }
+
+  return weight;
+}
+
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
+                                             const int height, const int width, const int kernel_h, const int kernel_w,
+                                             const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+                                             const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+                                             const int batch_size, const int num_channels, const int deformable_group,
+                                             const int height_col, const int width_col,
+                                             scalar_t *data_col)
+{
+  // One thread per (input channel, batch image, output y, output x); each
+  // writes the kernel_h*kernel_w deformably-sampled values of its patch into
+  // data_col.  Restored the stripped `<typename scalar_t>` template header
+  // and the `<scalar_t>` argument of static_cast below.
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // index index of output matrix
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
+    const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i)
+    {
+      for (int j = 0; j < kernel_w; ++j)
+      {
+        // offsets are stored as interleaved (dy, dx) planes per kernel tap
+        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+        {
+          //const scalar_t map_h = i * dilation_h + offset_h;
+          //const scalar_t map_w = j * dilation_w + offset_w;
+          //const int cur_height = height - h_in;
+          //const int cur_width = width - w_in;
+          //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+          val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+void deformable_im2col(
+    const at::Tensor data_im, const at::Tensor data_offset, const int channels,
+    const int height, const int width, const int ksize_h, const int ksize_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, const int parallel_imgs,
+    const int deformable_group, at::Tensor data_col)
+{
+  // Host-side launcher for deformable_im2col_gpu_kernel: unfolds
+  // parallel_imgs images into data_col using the per-location offsets.
+  // Restored the stripped `<scalar_t>` arguments on data_ptr and the
+  // kernel's execution configuration (`<<<...>>>`), lost during extraction.
+  // num_axes should be smaller than block size
+  // todo: check parallel_imgs is correctly passed in
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+
+        deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+            channel_per_deformable_group, parallel_imgs, channels, deformable_group,
+            height_col, width_col, data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
+  }
+}
+
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n, const scalar_t *data_col, const scalar_t *data_offset,
+    const int channels, const int height, const int width,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size, const int deformable_group,
+    const int height_col, const int width_col,
+    scalar_t *grad_im)
+{
+  // One thread per column entry; scatters its gradient back to the (up to
+  // four) input pixels its bilinear sample touched, via atomicAdd.  Restored
+  // the stripped `<typename scalar_t>` template header.
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
+                                                        2 * kernel_h * kernel_w * height_col * width_col;
+    const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index];
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    // Scan a 5x5 neighbourhood; get_gradient_weight is non-zero only for the
+    // four pixels actually adjacent to the fractional sampling point.
+    for (int dy = -2; dy <= 2; dy++)
+    {
+      for (int dx = -2; dx <= 2; dx++)
+      {
+        if (cur_h + dy >= 0 && cur_h + dy < height &&
+            cur_w + dx >= 0 && cur_w + dx < width &&
+            abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1)
+        {
+          int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
+          // atomic: several column entries may hit the same input pixel
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+void deformable_col2im(
+    const at::Tensor data_col, const at::Tensor data_offset, const int channels,
+    const int height, const int width, const int ksize_h,
+    const int ksize_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int parallel_imgs, const int deformable_group,
+    at::Tensor grad_im)
+{
+  // Host-side launcher for deformable_col2im_gpu_kernel: folds column
+  // gradients back into the input-image gradient grad_im (accumulated).
+  // Restored the stripped `<scalar_t>` arguments on data_ptr and the
+  // kernel's execution configuration (`<<<...>>>`), lost during extraction.
+  // todo: make sure parallel_imgs is passed in correctly
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
+            ksize_w, pad_h, pad_w, stride_h, stride_w,
+            dilation_h, dilation_w, channel_per_deformable_group,
+            parallel_imgs, deformable_group, height_col, width_col, grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
+  }
+}
+
// Gradient of deformable convolution w.r.t. the learned sampling
// offsets.  One thread per offset value (2 * kernel_h * kernel_w per
// deformable group per output location); each thread accumulates the
// gradient over all input channels of its group.
// Fix: the `template <typename scalar_t>` header was stripped from the
// checked-in text and is restored here.
template <typename scalar_t>
__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
                                                   const scalar_t *data_im, const scalar_t *data_offset,
                                                   const int channels, const int height, const int width,
                                                   const int kernel_h, const int kernel_w,
                                                   const int pad_h, const int pad_w,
                                                   const int stride_h, const int stride_w,
                                                   const int dilation_h, const int dilation_w,
                                                   const int channel_per_deformable_group,
                                                   const int batch_size, const int offset_channels, const int deformable_group,
                                                   const int height_col, const int width_col, scalar_t *grad_offset)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    scalar_t val = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
                                                  batch_size * width_col * height_col;
    const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
                                                channel_per_deformable_group / kernel_h / kernel_w * height * width;
    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
                                                        kernel_h * kernel_w * height_col * width_col;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // Walk every column-buffer entry that was produced with this offset.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
    {
      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2; // 0: d/d(offset_h), 1: d/d(offset_w)

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
      scalar_t inv_h = h_in + i * dilation_h + offset_h;
      scalar_t inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
      {
        // Sample fell fully outside the image: force a zero weight below.
        inv_h = inv_w = -2;
      }
      const scalar_t weight = get_coordinate_weight(
          inv_h, inv_w,
          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
      val += weight * data_col_ptr[col_pos];
      cnt += 1;
    }

    grad_offset[index] = val;
  }
}
+
// Host wrapper computing the gradient w.r.t. the offset tensor.
// Fixes: stripped `<...>` tokens (launch config, data_ptr template
// args) restored; a cudaGetLastError() check is added for consistency
// with deformable_col2im and the modulated wrappers.
void deformable_col2im_coord(
    const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
    const int channels, const int height, const int width, const int ksize_h,
    const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
{

  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
  // One thread per offset value per output location per image.
  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
  int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();

        deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group,
            parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
            height_col, width_col, grad_offset_);
      }));

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in deformable_col2im_coord: %s\n", cudaGetErrorString(err));
  }
}
+
// Bilinear interpolation of bottom_data at fractional location (h, w).
// Corners that fall outside [0, height) x [0, width) contribute zero.
// Fix: restored the stripped `template <typename scalar_t>` header.
template <typename scalar_t>
__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
                                         const int height, const int width, scalar_t h, scalar_t w)
{
  int h_low = floor(h);
  int w_low = floor(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  // Fractional distances to the low corner.
  scalar_t lh = h - h_low;
  scalar_t lw = w - w_low;
  scalar_t hh = 1 - lh, hw = 1 - lw;

  scalar_t v1 = 0;
  if (h_low >= 0 && w_low >= 0)
    v1 = bottom_data[h_low * data_width + w_low];
  scalar_t v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
    v2 = bottom_data[h_low * data_width + w_high];
  scalar_t v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
    v3 = bottom_data[h_high * data_width + w_low];
  scalar_t v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
    v4 = bottom_data[h_high * data_width + w_high];

  scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}
+
// Weight with which a gradient at fractional position
// (argmax_h, argmax_w) flows to the integer pixel (h, w) — the
// transpose of the bilinear interpolation used in the forward pass.
// Returns 0 when the fractional position is entirely off the image.
// Fix: restored the stripped `template <typename scalar_t>` header.
template <typename scalar_t>
__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
                                             const int h, const int w, const int height, const int width)
{
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
  {
    //empty
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  // (h, w) can match at most one of the four bilinear corners.
  scalar_t weight = 0;
  if (h == argmax_h_low && w == argmax_w_low)
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  if (h == argmax_h_low && w == argmax_w_high)
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  if (h == argmax_h_high && w == argmax_w_low)
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  if (h == argmax_h_high && w == argmax_w_high)
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  return weight;
}
+
// Derivative of the bilinear sample at (argmax_h, argmax_w) w.r.t. the
// sampling coordinate: bp_dir == 0 differentiates w.r.t. h,
// bp_dir == 1 w.r.t. w.  Returns 0 for fully out-of-bounds positions.
// Fix: restored the stripped `template <typename scalar_t>` header.
template <typename scalar_t>
__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
                                               const int height, const int width, const scalar_t *im_data,
                                               const int data_width, const int bp_dir)
{
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
  {
    //empty
    return 0;
  }

  int argmax_h_low = floor(argmax_h);
  int argmax_w_low = floor(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  scalar_t weight = 0;

  if (bp_dir == 0)
  {
    // d(sample)/d(h): the two low-row corners get negative sign.
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high];
  }
  else if (bp_dir == 1)
  {
    // d(sample)/d(w): the two low-column corners get negative sign.
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high];
  }

  return weight;
}
+
// Modulated (DCNv2) im2col: samples the input at offset-shifted
// fractional locations via bilinear interpolation and scales each
// sample by its learned mask before writing into the column buffer.
// Fixes: restored the stripped `template <typename scalar_t>` header
// and the `static_cast<scalar_t>` template argument.
template <typename scalar_t>
__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
                                                       const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
                                                       const int height, const int width, const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int num_channels, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       scalar_t *data_col)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    // index index of output matrix
    const int w_col = index % width_col;
    const int h_col = (index / width_col) % height_col;
    const int b_col = (index / width_col / height_col) % batch_size;
    const int c_im = (index / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    // compute deformable group index
    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;

    scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
    const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
    const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;

    const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i)
    {
      for (int j = 0; j < kernel_w; ++j)
      {
        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
        const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col;
        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
        const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
        scalar_t val = static_cast<scalar_t>(0);
        const scalar_t h_im = h_in + i * dilation_h + offset_h;
        const scalar_t w_im = w_in + j * dilation_w + offset_w;
        //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
        {
          //const float map_h = i * dilation_h + offset_h;
          //const float map_w = j * dilation_w + offset_w;
          //const int cur_height = height - h_in;
          //const int cur_width = width - w_in;
          //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
          val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
        }
        // Modulation: the mask multiplies the sampled value (DCNv2).
        *data_col_ptr = val * mask;
        data_col_ptr += batch_size * height_col * width_col;
        //data_col_ptr += height_col * width_col;
      }
    }
  }
}
+
// Modulated (DCNv2) col2im: scatters masked column-buffer gradients
// back into grad_im.  Each thread owns one column entry and atomically
// adds its bilinear shares to neighbouring input pixels; the dy/dx
// window plus the |dist| < 1 test selects the (up to 4) pixels that
// actually received bilinear weight.
// Fix: restored the stripped `template <typename scalar_t>` header.
template <typename scalar_t>
__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
                                                       const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
                                                       const int channels, const int height, const int width,
                                                       const int kernel_h, const int kernel_w,
                                                       const int pad_h, const int pad_w,
                                                       const int stride_h, const int stride_w,
                                                       const int dilation_h, const int dilation_w,
                                                       const int channel_per_deformable_group,
                                                       const int batch_size, const int deformable_group,
                                                       const int height_col, const int width_col,
                                                       scalar_t *grad_im)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
    // compute the start and end of the output

    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
    const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;

    // Chain rule through the forward multiplication by `mask`.
    const scalar_t cur_top_grad = data_col[index] * mask;
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    for (int dy = -2; dy <= 2; dy++)
    {
      for (int dx = -2; dx <= 2; dx++)
      {
        if (cur_h + dy >= 0 && cur_h + dy < height &&
            cur_w + dx >= 0 && cur_w + dx < width &&
            abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1)
        {
          int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}
+
// Modulated (DCNv2) backward pass for the offset AND mask tensors.
// One thread per offset value; `val` accumulates the offset gradient,
// `mval` the mask gradient (the forward sample itself).  The mask
// gradient is written once per (h, w) offset pair, on the even
// (offset_h) channel.
// Fix: restored the stripped `template <typename scalar_t>` header.
template <typename scalar_t>
__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
                                                             const scalar_t *data_col, const scalar_t *data_im,
                                                             const scalar_t *data_offset, const scalar_t *data_mask,
                                                             const int channels, const int height, const int width,
                                                             const int kernel_h, const int kernel_w,
                                                             const int pad_h, const int pad_w,
                                                             const int stride_h, const int stride_w,
                                                             const int dilation_h, const int dilation_w,
                                                             const int channel_per_deformable_group,
                                                             const int batch_size, const int offset_channels, const int deformable_group,
                                                             const int height_col, const int width_col,
                                                             scalar_t *grad_offset, scalar_t *grad_mask)
{
  CUDA_KERNEL_LOOP(index, n)
  {
    scalar_t val = 0, mval = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
    const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
    const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    // Walk every column-buffer entry that was produced with this offset.
    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
    {
      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      const int bp_dir = offset_c % 2; // 0: d/d(offset_h), 1: d/d(offset_w)

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
      const scalar_t mask = data_mask_ptr[data_mask_hw_ptr];
      scalar_t inv_h = h_in + i * dilation_h + offset_h;
      scalar_t inv_w = w_in + j * dilation_w + offset_w;
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
      {
        // Out-of-bounds sample: zero gradient weight below, and no
        // contribution to the mask gradient.
        inv_h = inv_w = -2;
      }
      else
      {
        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
      }
      const scalar_t weight = dmcn_get_coordinate_weight(
          inv_h, inv_w,
          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
      val += weight * data_col_ptr[col_pos] * mask;
      cnt += 1;
    }
    // KERNEL_ASSIGN(grad_offset[index], offset_req, val);
    grad_offset[index] = val;
    if (offset_c % 2 == 0)
      // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval);
      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
  }
}
+
// Host wrapper for the modulated (DCNv2) im2col kernel.
// Fixes: restored stripped `<...>` tokens (launch config, data_ptr
// template args) and corrected the `kenerl_w` parameter-name typo
// (positional in C++, so callers are unaffected).
void modulated_deformable_im2col_cuda(
    const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group, at::Tensor data_col)
{
  // num_axes should be smaller than block size
  const int channel_per_deformable_group = channels / deformable_group;
  // One thread per input channel per output location per image.
  const int num_kernels = channels * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();

        modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kernel_w,
            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, channels, deformable_group, height_col, width_col, data_col_);
      }));

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
  }
}
+
// Host wrapper for the modulated (DCNv2) col2im kernel (gradient
// w.r.t. the input feature map).
// Fixes: restored stripped `<...>` tokens, and BUG FIX — the kernel
// was invoked with `pad_h, pad_h`, so width padding silently used the
// height padding value; it now passes `pad_h, pad_w`.
void modulated_deformable_col2im_cuda(
    const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group, at::Tensor grad_im)
{

  const int channel_per_deformable_group = channels / deformable_group;
  // One thread per column-buffer element.
  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();

        modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
            kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, deformable_group, height_col, width_col, grad_im_);
      }));

  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
  }
}
+
// Host wrapper computing the gradients w.r.t. both the offset and the
// mask tensors for modulated (DCNv2) deformable convolution.
// Fix: restored stripped `<...>` tokens (launch config, data_ptr
// template args).
void modulated_deformable_col2im_coord_cuda(
    const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
    const int batch_size, const int channels, const int height_im, const int width_im,
    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int deformable_group,
    at::Tensor grad_offset, at::Tensor grad_mask)
{
  // One thread per offset value per output location per image.
  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
  const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();

        modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
            num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
            kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
            dilation_h, dilation_w, channel_per_deformable_group,
            batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col,
            grad_offset_, grad_mask_);
      }));
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess)
  {
    printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err));
  }
}
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bbe22d77b49be70f174ae3f17647b09968358255
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu
@@ -0,0 +1,87 @@
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
+
+// based on
+// author: Charles Shang
+// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
+
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>
#include <cmath>


// NOTE(review): the nine include targets above were empty in the
// checked-in text (every `<...>` span was stripped); they are restored
// to match the 2/2/3 grouping of the upstream maskrcnn-benchmark
// deform_pool_cuda.cu — confirm against that repository.

// Implemented in deform_pool_kernel_cuda.cu: forward pass of
// deformable PS-RoI pooling.
void DeformablePSROIPoolForward(
    const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
    at::Tensor out, at::Tensor top_count, const int batch, const int channels,
    const int height, const int width, const int num_bbox,
    const int channels_trans, const int no_trans, const float spatial_scale,
    const int output_dim, const int group_size, const int pooled_size,
    const int part_size, const int sample_per_part, const float trans_std);

// Implemented in deform_pool_kernel_cuda.cu: backward pass,
// accumulating gradients for the input and the trans (offset) tensor.
void DeformablePSROIPoolBackwardAcc(
    const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
    const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
    at::Tensor trans_grad, const int batch, const int channels,
    const int height, const int width, const int num_bbox,
    const int channels_trans, const int no_trans, const float spatial_scale,
    const int output_dim, const int group_size, const int pooled_size,
    const int part_size, const int sample_per_part, const float trans_std);
+
// Python-facing entry point: validates shapes and dispatches the
// deformable PS-RoI pooling forward kernel.
void deform_psroi_pooling_cuda_forward(
    at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
    at::Tensor top_count, const int no_trans, const float spatial_scale,
    const int output_dim, const int group_size, const int pooled_size,
    const int part_size, const int sample_per_part, const float trans_std)
{
  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);
  // When no_trans is set the trans tensor is unused; 2 is a placeholder.
  const int channels_trans = no_trans ? 2 : trans.size(1);

  const int num_bbox = bbox.size(0);
  // Bug fix: AT_ERROR does not interpret printf-style "%d" specifiers —
  // the old message printed the literal "(%d vs %d)." with the numbers
  // appended.  TORCH_CHECK streams its arguments instead.
  TORCH_CHECK(num_bbox == out.size(0),
              "Output shape and bbox number wont match: (",
              out.size(0), " vs ", num_bbox, ").");

  DeformablePSROIPoolForward(
      input, bbox, trans, out, top_count, batch, channels, height, width,
      num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
      pooled_size, part_size, sample_per_part, trans_std);
}
+
// Python-facing entry point: validates shapes and dispatches the
// deformable PS-RoI pooling backward kernel (gradients for input and
// trans tensors).
void deform_psroi_pooling_cuda_backward(
    at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
    at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
    const int no_trans, const float spatial_scale, const int output_dim,
    const int group_size, const int pooled_size, const int part_size,
    const int sample_per_part, const float trans_std)
{
  TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");

  const int batch = input.size(0);
  const int channels = input.size(1);
  const int height = input.size(2);
  const int width = input.size(3);
  // When no_trans is set the trans tensor is unused; 2 is a placeholder.
  const int channels_trans = no_trans ? 2 : trans.size(1);

  const int num_bbox = bbox.size(0);
  // Bug fix: AT_ERROR does not interpret printf-style "%d" specifiers —
  // the old message printed the literal "(%d vs %d)." with the numbers
  // appended.  TORCH_CHECK streams its arguments instead.
  TORCH_CHECK(num_bbox == out_grad.size(0),
              "Output shape and bbox number wont match: (",
              out_grad.size(0), " vs ", num_bbox, ").");

  DeformablePSROIPoolBackwardAcc(
      out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
      channels, height, width, num_bbox, channels_trans, no_trans,
      spatial_scale, output_dim, group_size, pooled_size, part_size,
      sample_per_part, trans_std);
}
diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3f6c4cb22f6ecbae242e21c9530f474e709c6e90
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu
@@ -0,0 +1,365 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file deformable_psroi_pooling.cu
+ * \brief
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
+*/
+/***************** Adapted by Charles Shang *********************/
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
+
+
+#include
+#include
+#include
+#include
+#include
+
+using namespace at;
+
// Grid-stride loop: starting from this thread's global index, step by the
// total number of launched threads so any grid size covers all n elements.
#define CUDA_KERNEL_LOOP(i, n)                                  \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);  \
       i += blockDim.x * gridDim.x)

const int CUDA_NUM_THREADS = 1024;

// Smallest block count such that GET_BLOCKS(N) * CUDA_NUM_THREADS >= N.
inline int GET_BLOCKS(const int N)
{
  const int threads = CUDA_NUM_THREADS;
  return (N + threads - 1) / threads;
}
+
+template
+__device__ scalar_t bilinear_interp(
+ const scalar_t *data,
+ const scalar_t x,
+ const scalar_t y,
+ const int width,
+ const int height)
+{
+ int x1 = floor(x);
+ int x2 = ceil(x);
+ int y1 = floor(y);
+ int y2 = ceil(y);
+ scalar_t dist_x = (scalar_t)(x - x1);
+ scalar_t dist_y = (scalar_t)(y - y1);
+ scalar_t value11 = data[y1 * width + x1];
+ scalar_t value12 = data[y2 * width + x1];
+ scalar_t value21 = data[y1 * width + x2];
+ scalar_t value22 = data[y2 * width + x2];
+ scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
+ return value;
+}
+
+template
+__global__ void DeformablePSROIPoolForwardKernel(
+ const int count,
+ const scalar_t *bottom_data,
+ const scalar_t spatial_scale,
+ const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const scalar_t *bottom_rois, const scalar_t *bottom_trans,
+ const int no_trans,
+ const scalar_t trans_std,
+ const int sample_per_part,
+ const int output_dim,
+ const int group_size,
+ const int part_size,
+ const int num_classes,
+ const int channels_each_class,
+ scalar_t *top_data,
+ scalar_t *top_count)
+{
+ CUDA_KERNEL_LOOP(index, count)
+ {
+ // The output is in order (n, ctop, ph, pw)
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
+ int n = index / pooled_width / pooled_height / output_dim;
+
+ // [start, end) interval for spatial sampling
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+ // Force too small ROIs to be 1x1
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+ // Compute w and h at bottom
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+ int class_id = ctop / channels_each_class;
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+ wstart += trans_x * roi_width;
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+ hstart += trans_y * roi_height;
+
+ scalar_t sum = 0;
+ int count = 0;
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+ gw = min(max(gw, 0), group_size - 1);
+ gh = min(max(gh, 0), group_size - 1);
+
+ const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
+ for (int ih = 0; ih < sample_per_part; ih++)
+ {
+ for (int iw = 0; iw < sample_per_part; iw++)
+ {
+ scalar_t w = wstart + iw * sub_bin_size_w;
+ scalar_t h = hstart + ih * sub_bin_size_h;
+ // bilinear interpolation
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+ {
+ continue;
+ }
+ w = min(max(w, 0.), width - 1.);
+ h = min(max(h, 0.), height - 1.);
+ int c = (ctop * group_size + gh) * group_size + gw;
+ scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
+ sum += val;
+ count++;
+ }
+ }
+ top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
+ top_count[index] = count;
+ }
+}
+
+template
+__global__ void DeformablePSROIPoolBackwardAccKernel(
+ const int count,
+ const scalar_t *top_diff,
+ const scalar_t *top_count,
+ const int num_rois,
+ const scalar_t spatial_scale,
+ const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int output_dim,
+ scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
+ const scalar_t *bottom_data,
+ const scalar_t *bottom_rois,
+ const scalar_t *bottom_trans,
+ const int no_trans,
+ const scalar_t trans_std,
+ const int sample_per_part,
+ const int group_size,
+ const int part_size,
+ const int num_classes,
+ const int channels_each_class)
+{
+ CUDA_KERNEL_LOOP(index, count)
+ {
+ // The output is in order (n, ctop, ph, pw)
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int ctop = (index / pooled_width / pooled_height) % output_dim;
+ int n = index / pooled_width / pooled_height / output_dim;
+
+ // [start, end) interval for spatial sampling
+ const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+ scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+ scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+ scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+ // Force too small ROIs to be 1x1
+ scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+ scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+ // Compute w and h at bottom
+ scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+ scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+ scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+ scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+ int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+ int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+ int class_id = ctop / channels_each_class;
+ scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+ scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
+ scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+ wstart += trans_x * roi_width;
+ scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+ hstart += trans_y * roi_height;
+
+ if (top_count[index] <= 0)
+ {
+ continue;
+ }
+ scalar_t diff_val = top_diff[index] / top_count[index];
+ const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
+ scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
+ int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+ int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+ gw = min(max(gw, 0), group_size - 1);
+ gh = min(max(gh, 0), group_size - 1);
+
+ for (int ih = 0; ih < sample_per_part; ih++)
+ {
+ for (int iw = 0; iw < sample_per_part; iw++)
+ {
+ scalar_t w = wstart + iw * sub_bin_size_w;
+ scalar_t h = hstart + ih * sub_bin_size_h;
+ // bilinear interpolation
+ if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+ {
+ continue;
+ }
+ w = min(max(w, 0.), width - 1.);
+ h = min(max(h, 0.), height - 1.);
+ int c = (ctop * group_size + gh) * group_size + gw;
+ // backward on feature
+ int x0 = floor(w);
+ int x1 = ceil(w);
+ int y0 = floor(h);
+ int y1 = ceil(h);
+ scalar_t dist_x = w - x0, dist_y = h - y0;
+ scalar_t q00 = (1 - dist_x) * (1 - dist_y);
+ scalar_t q01 = (1 - dist_x) * dist_y;
+ scalar_t q10 = dist_x * (1 - dist_y);
+ scalar_t q11 = dist_x * dist_y;
+ int bottom_index_base = c * height * width;
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
+ atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
+
+ if (no_trans)
+ {
+ continue;
+ }
+ scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+ scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+ scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+ scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+ scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
+ diff_x *= roi_width;
+ scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
+ diff_y *= roi_height;
+
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
+ atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
+ }
+ }
+ }
+}
+
+void DeformablePSROIPoolForward(const at::Tensor data,
+ const at::Tensor bbox,
+ const at::Tensor trans,
+ at::Tensor out,
+ at::Tensor top_count,
+ const int batch,
+ const int channels,
+ const int height,
+ const int width,
+ const int num_bbox,
+ const int channels_trans,
+ const int no_trans,
+ const float spatial_scale,
+ const int output_dim,
+ const int group_size,
+ const int pooled_size,
+ const int part_size,
+ const int sample_per_part,
+ const float trans_std)
+{
+ const int pooled_height = pooled_size;
+ const int pooled_width = pooled_size;
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
+ const scalar_t *bottom_data = data.data_ptr();
+ const scalar_t *bottom_rois = bbox.data_ptr();
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr();
+ scalar_t *top_data = out.data_ptr();
+ scalar_t *top_count_data = top_count.data_ptr();
+
+ DeformablePSROIPoolForwardKernel<<>>(
+ count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
+ bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim,
+ group_size, part_size, num_classes, channels_each_class, top_data, top_count_data);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
+ }
+}
+
+void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
+ const at::Tensor data,
+ const at::Tensor bbox,
+ const at::Tensor trans,
+ const at::Tensor top_count,
+ at::Tensor in_grad,
+ at::Tensor trans_grad,
+ const int batch,
+ const int channels,
+ const int height,
+ const int width,
+ const int num_bbox,
+ const int channels_trans,
+ const int no_trans,
+ const float spatial_scale,
+ const int output_dim,
+ const int group_size,
+ const int pooled_size,
+ const int part_size,
+ const int sample_per_part,
+ const float trans_std)
+{
+ // LOG(INFO) << "DeformablePSROIPoolBackward";
+ const int num_rois = num_bbox;
+ const int pooled_height = pooled_size;
+ const int pooled_width = pooled_size;
+ const int count = num_bbox * output_dim * pooled_height * pooled_width;
+ const int num_classes = no_trans ? 1 : channels_trans / 2;
+ const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
+ const scalar_t *top_diff = out_grad.data_ptr();
+ const scalar_t *bottom_data = data.data_ptr();
+ const scalar_t *bottom_rois = bbox.data_ptr();
+ const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr();
+ scalar_t *bottom_data_diff = in_grad.data_ptr();
+ scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr();
+ const scalar_t *top_count_data = top_count.data_ptr();
+
+ DeformablePSROIPoolBackwardAccKernel<<>>(
+ count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
+ pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
+ bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
+ group_size, part_size, num_classes, channels_each_class);
+ }));
+
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess)
+ {
+ printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err));
+ }
+}
\ No newline at end of file
diff --git a/maskrcnn_benchmark/csrc/cuda/ml_nms.cu b/maskrcnn_benchmark/csrc/cuda/ml_nms.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cd958a0899a9e3adc69ca053170beb2b34fbd8ef
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/ml_nms.cu
@@ -0,0 +1,136 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+// 64 bits per suppression mask word: one bit per box in a tile.
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+// IoU between two boxes laid out as [x1, y1, x2, y2, score, label].
+// Multi-label NMS: boxes with different labels (a[5] != b[5]) never
+// suppress each other. The +1 terms use inclusive pixel coordinates.
+__device__ inline float devIoU(float const * const a, float const * const b) {
+  if (a[5] != b[5]) {
+    return 0.0;
+  }
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+// Tiled pairwise-overlap kernel. The grid is (col_blocks x col_blocks) tiles
+// of up to threadsPerBlock (64) boxes each; each thread owns one "row" box
+// and emits a 64-bit mask whose bit i is set when column box i of this tile
+// overlaps it above nms_overlap_thresh. Boxes are 6 floats each
+// (x1, y1, x2, y2, score, label); the host reduces the masks greedily.
+__global__ void ml_nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float *dev_boxes, unsigned long long *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  // Last tile may be partially filled.
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Stage this tile's column boxes (6 floats each) in shared memory.
+  __shared__ float block_boxes[threadsPerBlock * 6];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 6 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0];
+    block_boxes[threadIdx.x * 6 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1];
+    block_boxes[threadIdx.x * 6 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2];
+    block_boxes[threadIdx.x * 6 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3];
+    block_boxes[threadIdx.x * 6 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4];
+    block_boxes[threadIdx.x * 6 + 5] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 6;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    // On the diagonal tile, only compare against later boxes to avoid
+    // self-comparison and duplicated pairs.
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 6) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+// boxes is a N x 6 tensor
+at::Tensor ml_nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+ using scalar_t = float;
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
+ auto scores = boxes.select(1, 4);
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+ auto boxes_sorted = boxes.index_select(0, order_t);
+
+ int boxes_num = boxes.size(0);
+
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
+
+ scalar_t* boxes_dev = boxes_sorted.data_ptr();
+
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+
+ unsigned long long* mask_dev = NULL;
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
+
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
+
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
+ THCCeilDiv(boxes_num, threadsPerBlock));
+ dim3 threads(threadsPerBlock);
+ ml_nms_kernel<<>>(boxes_num,
+ nms_overlap_thresh,
+ boxes_dev,
+ mask_dev);
+
+ std::vector mask_host(boxes_num * col_blocks);
+ THCudaCheck(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(unsigned long long) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
+
+ std::vector remv(col_blocks);
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
+ int64_t* keep_out = keep.data_ptr();
+
+ int num_to_keep = 0;
+ for (int i = 0; i < boxes_num; i++) {
+ int nblock = i / threadsPerBlock;
+ int inblock = i % threadsPerBlock;
+
+ if (!(remv[nblock] & (1ULL << inblock))) {
+ keep_out[num_to_keep++] = i;
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
+ for (int j = nblock; j < col_blocks; j++) {
+ remv[j] |= p[j];
+ }
+ }
+ }
+
+ THCudaFree(state, mask_dev);
+ // TODO improve this part
+ return std::get<0>(order_t.index({
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
+ order_t.device(), keep.scalar_type())
+ }).sort(0, false));
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d6221b85fa8f6b40cf498b76d6dbfc3c8438e25e
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/nms.cu
@@ -0,0 +1,131 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+// 64 bits per suppression mask word: one bit per box in a tile.
+int const threadsPerBlock = sizeof(unsigned long long) * 8;
+
+// Class-agnostic IoU between two boxes laid out as [x1, y1, x2, y2, ...].
+// The +1 terms use inclusive pixel coordinates.
+__device__ inline float devIoU(float const * const a, float const * const b) {
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float interS = width * height;
+  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return interS / (Sa + Sb - interS);
+}
+
+// Tiled pairwise-overlap kernel. The grid is (col_blocks x col_blocks) tiles
+// of up to threadsPerBlock (64) boxes each; each thread owns one "row" box
+// and emits a 64-bit mask whose bit i is set when column box i of this tile
+// overlaps it above nms_overlap_thresh. Boxes are 5 floats each
+// (x1, y1, x2, y2, score); the host reduces the masks greedily.
+__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
+                           const float *dev_boxes, unsigned long long *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  // if (row_start > col_start) return;
+
+  // Last tile may be partially filled.
+  const int row_size =
+        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
+  const int col_size =
+        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  // Stage this tile's column boxes (5 floats each) in shared memory.
+  __shared__ float block_boxes[threadsPerBlock * 5];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 5 + 0] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
+    block_boxes[threadIdx.x * 5 + 1] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
+    block_boxes[threadIdx.x * 5 + 2] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
+    block_boxes[threadIdx.x * 5 + 3] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
+    block_boxes[threadIdx.x * 5 + 4] =
+        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 5;
+    int i = 0;
+    unsigned long long t = 0;
+    int start = 0;
+    // On the diagonal tile, only compare against later boxes to avoid
+    // self-comparison and duplicated pairs.
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
+
+// boxes is a N x 5 tensor
+at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
+ using scalar_t = float;
+ AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
+ auto scores = boxes.select(1, 4);
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+ auto boxes_sorted = boxes.index_select(0, order_t);
+
+ int boxes_num = boxes.size(0);
+
+ const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
+
+ scalar_t* boxes_dev = boxes_sorted.data_ptr();
+
+ THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
+
+ unsigned long long* mask_dev = NULL;
+ //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
+ // boxes_num * col_blocks * sizeof(unsigned long long)));
+
+ mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
+
+ dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
+ THCCeilDiv(boxes_num, threadsPerBlock));
+ dim3 threads(threadsPerBlock);
+ nms_kernel<<>>(boxes_num,
+ nms_overlap_thresh,
+ boxes_dev,
+ mask_dev);
+
+ std::vector mask_host(boxes_num * col_blocks);
+ THCudaCheck(cudaMemcpy(&mask_host[0],
+ mask_dev,
+ sizeof(unsigned long long) * boxes_num * col_blocks,
+ cudaMemcpyDeviceToHost));
+
+ std::vector remv(col_blocks);
+ memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
+
+ at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
+ int64_t* keep_out = keep.data_ptr();
+
+ int num_to_keep = 0;
+ for (int i = 0; i < boxes_num; i++) {
+ int nblock = i / threadsPerBlock;
+ int inblock = i % threadsPerBlock;
+
+ if (!(remv[nblock] & (1ULL << inblock))) {
+ keep_out[num_to_keep++] = i;
+ unsigned long long *p = &mask_host[0] + i * col_blocks;
+ for (int j = nblock; j < col_blocks; j++) {
+ remv[j] |= p[j];
+ }
+ }
+ }
+
+ THCudaFree(state, mask_dev);
+ // TODO improve this part
+ return std::get<0>(order_t.index({
+ keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
+ order_t.device(), keep.scalar_type())
+ }).sort(0, false));
+}
diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h
new file mode 100644
index 0000000000000000000000000000000000000000..16a7f644ed5798d1917d32cda0590161b6da8c64
--- /dev/null
+++ b/maskrcnn_benchmark/csrc/cuda/vision.h
@@ -0,0 +1,116 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+#pragma once
+#include
+
+
+// Sigmoid focal loss, CUDA forward. Implementation lives elsewhere in the
+// project; see the corresponding .cu file for semantics.
+at::Tensor SigmoidFocalLoss_forward_cuda(
+		const at::Tensor& logits,
+                const at::Tensor& targets,
+		const int num_classes,
+		const float gamma,
+		const float alpha);
+
+// Sigmoid focal loss, CUDA backward: gradient w.r.t. logits given d_losses.
+at::Tensor SigmoidFocalLoss_backward_cuda(
+			     const at::Tensor& logits,
+                const at::Tensor& targets,
+			     const at::Tensor& d_losses,
+			     const int num_classes,
+			     const float gamma,
+			     const float alpha);
+
+// ROIAlign, CUDA forward: pools `rois` from `input` into fixed-size bins.
+at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
+                                 const at::Tensor& rois,
+                                 const float spatial_scale,
+                                 const int pooled_height,
+                                 const int pooled_width,
+                                 const int sampling_ratio);
+
+// ROIAlign, CUDA backward: scatters `grad` back to an input-shaped buffer.
+at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
+                                  const at::Tensor& rois,
+                                  const float spatial_scale,
+                                  const int pooled_height,
+                                  const int pooled_width,
+                                  const int batch_size,
+                                  const int channels,
+                                  const int height,
+                                  const int width,
+                                  const int sampling_ratio);
+
+
+std::tuple