zdou0830 committed
Commit 749745d
1 Parent(s): b4c3cb2
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. MODEL/desco_glip_tiny.pth +3 -0
  2. app.py +7 -4
  3. coco_000000281759.jpg +275 -0
  4. configs/flickr/test.yaml +22 -0
  5. configs/flickr/val.yaml +22 -0
  6. configs/lvis/minival.yaml +30 -0
  7. configs/omnilabel/omnilabel_val_eval.yaml +18 -0
  8. configs/pretrain/_coco.yaml +3 -0
  9. configs/pretrain/fiber_cc.yaml +144 -0
  10. configs/pretrain/fiber_tiny.yaml +157 -0
  11. configs/pretrain/fiber_tiny_lr.yaml +157 -0
  12. configs/pretrain/fibert_flickr_only.yaml +157 -0
  13. configs/pretrain/glip_Swin_Flickr.yaml +116 -0
  14. configs/pretrain/glip_Swin_L.yaml +120 -0
  15. configs/pretrain/glip_Swin_T_O365.yaml +102 -0
  16. configs/pretrain/glip_Swin_T_O365_GoldG.yaml +132 -0
  17. configs/pretrain/glip_Swin_T_O365_GoldG_description.yaml +112 -0
  18. configs/pretrain/glip_Swin_T_cc.yaml +116 -0
  19. configs/pretrain/glip_Swin_T_cc_augv3.yaml +126 -0
  20. configs/pretrain/glip_Swin_T_coco.yaml +100 -0
  21. configs/pretrain/glip_Swing_T_flickr.yaml +116 -0
  22. configs/pretrain/glip_large.yaml +120 -0
  23. configs/pretrain/mixed_nococo_flickr_objects365.yaml +162 -0
  24. configs/pretrain/mixed_nococo_flickr_objects365_refexpclean.yaml +162 -0
  25. configs/pretrain_new/desco_fiber.yaml +168 -0
  26. configs/pretrain_new/desco_glip.yaml +134 -0
  27. configs/refcoco.yaml +116 -0
  28. configs/refcocog.yaml +116 -0
  29. configs/refcocoplus.yaml +116 -0
  30. configs/refexp/_refcoco+_testA.yaml +30 -0
  31. configs/refexp/_refcoco+_testB.yaml +30 -0
  32. configs/refexp/_refcoco_testA.yaml +30 -0
  33. configs/refexp/_refcoco_testB.yaml +30 -0
  34. configs/refexp/_refcocog_test.yaml +30 -0
  35. docs/intro.md +287 -0
  36. maskrcnn_benchmark/__init__.py +1 -0
  37. maskrcnn_benchmark/config/__init__.py +3 -0
  38. maskrcnn_benchmark/config/defaults.py +982 -0
  39. maskrcnn_benchmark/config/paths_catalog.py +779 -0
  40. maskrcnn_benchmark/csrc/ROIAlign.h +46 -0
  41. maskrcnn_benchmark/csrc/ROIPool.h +48 -0
  42. maskrcnn_benchmark/csrc/SigmoidFocalLoss.h +41 -0
  43. maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +257 -0
  44. maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +75 -0
  45. maskrcnn_benchmark/csrc/cpu/soft_nms.cpp +117 -0
  46. maskrcnn_benchmark/csrc/cpu/vision.h +22 -0
  47. maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +346 -0
  48. maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +202 -0
  49. maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +188 -0
  50. maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu +691 -0
MODEL/desco_glip_tiny.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:199479f67b5fbd4ab5e232c8fa8df3e9ab42a96966a023524c6cd95710ea5192
+size 3707483035
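The checkpoint itself lives in Git LFS, so the commit stores only this three-line pointer. As a sanity check, here is a minimal sketch (assuming the ~3.7 GB weight file has already been materialized locally at MODEL/desco_glip_tiny.pth) that verifies a downloaded file against the pointer's sha256 oid:

```python
import hashlib

# Assumed local path; adjust to wherever the LFS object was downloaded.
WEIGHT_PATH = "MODEL/desco_glip_tiny.pth"
EXPECTED_OID = "199479f67b5fbd4ab5e232c8fa8df3e9ab42a96966a023524c6cd95710ea5192"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in 1 MiB chunks so the 3.7 GB checkpoint never sits in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of(WEIGHT_PATH) == EXPECTED_OID, "weights do not match the LFS pointer"
```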
app.py CHANGED
@@ -19,9 +19,12 @@ from maskrcnn_benchmark.config import cfg
 from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
 
 # Use this command for evaluate the GLIP-T model
-config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
+#config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
 #weight_file = "MODEL/glip_tiny_model_o365_goldg_cc_sbu.pth"
 
+config_file = "configs/pretrain_new/desco_glip.yaml"
+weight_file = "MODEL/desco_glip_tiny.pth"
+
 # Use this command if you want to try the GLIP-L model
 # ! wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth -O MODEL/glip_large_model.pth
 # config_file = "configs/pretrain/glip_Swin_L.yaml"
@@ -61,12 +64,12 @@ gr.Interface(
         ),
     ],
     examples=[
-        ["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
-        ["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
+        #["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
+        #["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
         ["./coco_000000281759.jpg", "A green umbrella. A pink striped umbrella. A plain white umbrella."],
         ["./coco_000000281759.jpg", "a flowery top. A blue dress. An orange shirt ."],
         ["./coco_000000281759.jpg", "a car . An electricity box ."],
-        ["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
+        #["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
     ],
     article=Path("docs/intro.md").read_text()
 ).launch()
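The net effect of this change is that the demo now loads the DesCo-GLIP config and checkpoint added in this commit. For context, a hedged sketch of how these two variables are typically consumed earlier in a GLIP-style app.py (merge_from_file/merge_from_list are standard yacs API on the cfg object this file already imports; the GLIPDemo keyword arguments are assumptions, not shown in this diff):

```python
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

config_file = "configs/pretrain_new/desco_glip.yaml"
weight_file = "MODEL/desco_glip_tiny.pth"

# Merge the YAML config into the global cfg, then point it at the new weights.
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])

# GLIPDemo wraps model construction, checkpoint loading, and grounded inference.
glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7)
```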
coco_000000281759.jpg ADDED
configs/flickr/test.yaml ADDED
@@ -0,0 +1,22 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # Placeholder
+  FCOS:
+    NUM_CLASSES: 8 # Placeholder
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8 # Placeholder
+  DYHEAD:
+    NUM_CLASSES: 8 # Placeholder
+DATASETS:
+  TRAIN: ("flickr30k_test", )
+  TEST: ("flickr30k_test", )
+  FLICKR_GT_TYPE: "separate"
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
configs/flickr/val.yaml ADDED
@@ -0,0 +1,22 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # Placeholder
+  FCOS:
+    NUM_CLASSES: 8 # Placeholder
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8 # Placeholder
+  DYHEAD:
+    NUM_CLASSES: 8 # Placeholder
+DATASETS:
+  TRAIN: ("flickr30k_val", )
+  TEST: ("flickr30k_val", )
+  FLICKR_GT_TYPE: "separate"
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
configs/lvis/minival.yaml ADDED
@@ -0,0 +1,30 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # these fields are not used; just a placeholder
+  FCOS:
+    NUM_CLASSES: 8
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8
+  DYHEAD:
+    NUM_CLASSES: 8
+DATASETS:
+  REGISTER:
+    lvis_evaluation_mini_val:
+      img_dir: "coco"
+      ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json"
+    lvis_evaluation_val:
+      img_dir: "coco"
+      ann_file: "coco/annotations/lvis_od_val.json"
+  TRAIN: ("lvis_evaluation_mini_val",)
+  TEST: ("lvis_evaluation_mini_val",)
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
+TEST:
+  IMS_PER_BATCH: 8
configs/omnilabel/omnilabel_val_eval.yaml ADDED
@@ -0,0 +1,18 @@
+DATASETS:
+  REGISTER:
+    omnilabel_val_lvis_minival:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.json"
+    omnilabel_val_lvis_selected:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.selected.json"
+    omnilabel_val_lvis_auto:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.auto.json"
+    omnilabel_val_flickr:
+      img_dir: "flickr30k/flickr30k_images/val/"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_val.v1.25-0.omnilabel.json"
+  TEST: ("omnilabel_val",)
+  # TEST: ("omnilabel_val_coco",)
+DATALOADER:
+  ASPECT_RATIO_GROUPING: False
configs/pretrain/_coco.yaml ADDED
@@ -0,0 +1,3 @@
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val", )
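These tiny fragments are dataset overrides meant to be layered on top of a full pretraining config rather than used alone. A minimal sketch of that layering with the yacs-based cfg object the repo already imports (the choice of base config here is just an example):

```python
from maskrcnn_benchmark.config import cfg

# Later merges win key-by-key, so the small fragment only overrides DATASETS.
cfg.merge_from_file("configs/pretrain_new/desco_glip.yaml")  # full base config
cfg.merge_from_file("configs/pretrain/_coco.yaml")           # dataset override

print(cfg.DATASETS.TRAIN)  # -> ('coco_2017_train',)
```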
configs/pretrain/fiber_cc.yaml ADDED
@@ -0,0 +1,144 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+    EMBED_DIM: 128
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (4, 8, 16, 32)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (128, 256, 512, 1024)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-v2"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny.noun.harsh"
+      yaml_name_no_coco: "tiny.noun.harsh"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  CAPTION_CONF: 0.4
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 2
+  MIXED_COPY: 2
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 235026
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: False
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
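One detail worth flagging in the SOLVER block: STEPS is given as fractions rather than iteration counts. Assuming, as GLIP-style training loops typically do, that fractional milestones are resolved against MAX_ITER, the implied LR-decay points for this config are:

```python
# Sketch under the stated assumption that fractional STEPS scale with MAX_ITER.
max_iter = 235026          # SOLVER.MAX_ITER from fiber_cc.yaml
steps = (0.67, 0.89)       # SOLVER.STEPS

milestones = [int(frac * max_iter) for frac in steps]
print(milestones)  # [157467, 209173] -- LR drops at ~67% and ~89% of training
```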
configs/pretrain/fiber_tiny.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/fiber_tiny_lr.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/fibert_flickr_only.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/glip_Swin_Flickr.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("flickr30k_train", )
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  # BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 1
+  MIXED_COPY: 1
+  OBJECT365_COPY: 1
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_L.yaml ADDED
@@ -0,0 +1,120 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    EMBED_DIM: 192
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (6, 12, 24, 48)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (192, 384, 768, 1536)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 8
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+
+  TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
+  TEST: ("coco_2017_val", )
+
+  ONE_HOT: False
+  FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
+  MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
+  OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
+  VG_COPY: 3 # 0.4 * 3 = ~1.2M
+  IN_COPY: 2 # 0.67 * 2 = ~1.33M
+  OI_COPY: 1 # 2M * 1 = 2M
+
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 1000000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
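The *_COPY factors oversample each grounding source so the training mixture lands near the totals quoted in the inline comments (IN and OI presumably stand for ImageNet boxes and OpenImages). A quick check that reproduces that arithmetic:

```python
# Approximate source sizes (millions of images) are taken from the config
# comments above; the multipliers are the *_COPY values.
sources = {
    "flickr":    (0.15, 8),   # FLICKR_COPY    -> ~1.2M
    "mixed":     (0.60, 4),   # MIXED_COPY     -> ~2.4M
    "object365": (1.40, 2),   # OBJECT365_COPY -> ~2.8M
    "vg":        (0.40, 3),   # VG_COPY        -> ~1.2M
    "in":        (0.67, 2),   # IN_COPY        -> ~1.33M
    "oi":        (2.00, 1),   # OI_COPY        -> 2M
}
for name, (size_m, copies) in sources.items():
    print(f"{name}: {size_m * copies:.2f}M effective samples per epoch")
```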
configs/pretrain/glip_Swin_T_O365.yaml ADDED
@@ -0,0 +1,102 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  TRAIN: ("object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  DESCRIPTION_FILE: "DATASET/Objects365/descriptions/o365.description.v1.json"
+
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_O365_GoldG.yaml ADDED
@@ -0,0 +1,132 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    mixed_train_no_coco_noun:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
+    mixed_train_no_coco_gpt:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
+    flickr30k_train_gpt:
+      img_folder: "flickr30k/flickr30k_images/train"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
+      is_train: True
+    mixed_train_no_coco_noun_gpt:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
+    mixed_train_no_coco_noun_gpt_0422:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0422.json"
+    mixed_train_no_coco_noun_gpt_0425:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
+    flickr30k_train_gpt_0425:
+      img_folder: "flickr30k/flickr30k_images/train"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
+      is_train: True
+
+  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json"
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_O365_GoldG_description.yaml ADDED
@@ -0,0 +1,112 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    mixed_train_no_coco_noun:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
+
+  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  OD_TO_GROUNDING_VERSION: "description.gpt.v2.allow_zero"
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+  MAX_NEG_PER_BATCH: 1.0
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_cc.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("bing_caption_train_no_coco",)
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 4
+  MIXED_COPY: 4
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_cc_augv3.yaml ADDED
@@ -0,0 +1,126 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny.noun.harsh"
+      yaml_name_no_coco: "tiny.noun.harsh"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  CAPTION_CONF: 0.4
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 2
+  MIXED_COPY: 2
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  #MAX_EPOCH: 12
+  MAX_ITER: 235026
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_coco.yaml ADDED
@@ -0,0 +1,100 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  TRAIN: ("coco_2017_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  SEPARATION_TOKENS: ". "
+  DESCRIPTION_FILE: "DATASET/coco/annotations/coco.description.v1.json"
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swing_T_flickr.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("mixed_train_no_coco", ) #"bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 4
+  MIXED_COPY: 4
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_large.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_large_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ CONV_BODY: "SWINT-FPN-RETINANET"
9
+ OUT_CHANNELS: 256
10
+
11
+ SWINT:
12
+ EMBED_DIM: 192
13
+ DEPTHS: (2, 2, 18, 2)
14
+ NUM_HEADS: (6, 12, 24, 48)
15
+ WINDOW_SIZE: 12
16
+ OUT_CHANNELS: (192, 384, 768, 1536)
17
+ DROP_PATH_RATE: 0.4
18
+
19
+ LANGUAGE_BACKBONE:
20
+ FREEZE: False
21
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
22
+ MASK_SPECIAL: False
23
+
24
+ RPN:
25
+ USE_FPN: True
26
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
27
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
28
+ ASPECT_RATIOS: (1.0,)
29
+ SCALES_PER_OCTAVE: 1
30
+
31
+ DYHEAD:
32
+ CHANNELS: 256
33
+ NUM_CONVS: 8
34
+ USE_GN: True
35
+ USE_DYRELU: True
36
+ USE_DFCONV: True
37
+ USE_DYFUSE: True
38
+ TOPK: 9 # topk for selecting candidate positive samples from each level
39
+ SCORE_AGG: "MEAN"
40
+ LOG_SCALE: 0.0
41
+
42
+ USE_CHECKPOINT: True
43
+ FUSE_CONFIG:
44
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
45
+ EARLY_FUSE_ON: True
46
+ TYPE: "MHA-B"
47
+ USE_CLASSIFICATION_LOSS: False
48
+ USE_TOKEN_LOSS: False
49
+ USE_CONTRASTIVE_ALIGN_LOSS: False
50
+ CONTRASTIVE_HIDDEN_DIM: 64
51
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
52
+ USE_LAYER_SCALE: True
53
+ CLAMP_MIN_FOR_UNDERFLOW: True
54
+ CLAMP_MAX_FOR_OVERFLOW: True
55
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
56
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
57
+ CLAMP_DOT_PRODUCT: True
58
+
59
+ DATASETS:
60
+
61
+ TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
62
+ TEST: ("coco_2017_val", )
63
+
64
+ ONE_HOT: False
65
+ FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
66
+ MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
67
+ OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
68
+ VG_COPY: 3 # 0.4 * 3 = ~1.2M
69
+ IN_COPY: 2 # 0.67 * 2 = ~1.33M
70
+ OI_COPY: 1 # 2M * 1 = 2M
71
+
72
+ DISABLE_SHUFFLE: False
73
+ ADD_DET_PROMPT: False
74
+ RANDOM_SAMPLE_NEG: 85
75
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
76
+ FURTHER_SCREEN: True
77
+ CAPTION_CONF: 0.5
78
+ CAPTION_NMS: -1.0
79
+ CAPTION_MIN_BOX: 1
80
+
81
+ SEPARATION_TOKENS: ". "
82
+
83
+ PACK_RANDOM_CAPTION_NUMBER: 20
84
+ NO_RANDOM_PACK_PROBABILITY: 0.4
85
+ RANDOM_PACK_PROB: 0.5
86
+ CAPTION_FORMAT_VERSION: "v2"
87
+
88
+ INPUT:
89
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
90
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
91
+ MIN_SIZE_TRAIN: 800
92
+ MAX_SIZE_TRAIN: 1333
93
+ MIN_SIZE_TEST: 800
94
+ MAX_SIZE_TEST: 1333
95
+
96
+ AUGMENT:
97
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
98
+
99
+ DATALOADER:
100
+ SIZE_DIVISIBILITY: 32
101
+
102
+ SOLVER:
103
+ OPTIMIZER: ADAMW
104
+ BASE_LR: 0.0001
105
+ LANG_LR: 0.00001
106
+ WEIGHT_DECAY: 0.01
107
+ WEIGHT_DECAY_SCHEDULE: True
108
+ STEPS: (0.67, 0.89)
109
+ MAX_ITER: 1000000
110
+ IMS_PER_BATCH: 64
111
+ WARMUP_ITERS: 2000
112
+ WARMUP_FACTOR: 0.001
113
+
114
+ FIND_UNUSED_PARAMETERS: False
115
+
116
+ CLIP_GRADIENTS:
117
+ ENABLED: True
118
+ CLIP_TYPE: "full_model"
119
+ CLIP_VALUE: 1.0
120
+ NORM_TYPE: 2.0
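
The *_COPY factors above oversample the smaller grounding sources so that each contributes a comparable share per effective epoch; the inline comments give approximate dataset sizes. Reproducing that arithmetic (sizes in millions of images, taken from the comments above, so treat them as rough):

    # (approx. images in millions, copy factor) per source, from the comments above
    mix = {"flickr": (0.15, 8), "mixed": (0.6, 4), "object365": (1.4, 2),
           "vg": (0.4, 3), "imagenet": (0.67, 2), "openimages": (2.0, 1)}
    effective = {name: size * copies for name, (size, copies) in mix.items()}
    print(effective)                  # every source lands in the ~1.2M-2.8M band
    print(sum(effective.values()))    # ~10.9M images per effective epoch
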
configs/pretrain/mixed_nococo_flickr_objects365.yaml ADDED
@@ -0,0 +1,162 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v2"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
65
+ TEST: ("coco_2017_val", )
66
+ ADD_DET_PROMPT: False
67
+ ADD_DET_PROMPT_ADVANCED: False
68
+ ALTERNATIVE_TRAINING: False
69
+ BOX_THRESHOLD: 0.1
70
+ CAPTION_CONF: 0.9
71
+ CAPTION_FORMAT_VERSION: "v2"
72
+ CAPTION_MIN_BOX: 1
73
+ CAPTION_NMS: 0.9
74
+ CLASS_AGNOSTIC: False
75
+ CLASS_CONCAT: False
76
+ COCO_COPY: 1
77
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
78
+ DISABLE_CLIP_TO_IMAGE: False
79
+ DISABLE_SHUFFLE: False
80
+ FEW_SHOT: 0
81
+ FLICKR_COPY: 1
82
+ FLICKR_GT_TYPE: "separate"
83
+ FULL_QUESTION_PROB: 0.5
84
+ FURTHER_SCREEN: False
85
+ GENERAL_COPY: -1
86
+ GENERAL_COPY_TEST: -1
87
+ INFERENCE_CAPTION: False
88
+ IN_COPY: 1
89
+ LOCAL_DEBUG: False
90
+ LVIS_COPY: 1
91
+ LVIS_USE_NORMAL_AP: False
92
+ MAX_BOX: -1
93
+ MIXED_COPY: 1
94
+ MULTISTAGE_TRAINING: False
95
+ NEG_QUESTION_PROB: 0.8
96
+ NO_MINUS_ONE_FOR_ONE_HOT: False
97
+ OBJECT365_COPY: 1
98
+ OI_COPY: 1
99
+ ONE_HOT: False
100
+ PACK_RANDOM_CAPTION_NUMBER: 0
101
+ POS_QUESTION_PROB: 0.6
102
+ PREDOWNLOAD_BING: False
103
+ PREDOWNLOAD_WITH_AZCOPY: False
104
+ PROMPT_LIMIT_NEG: -1
105
+ RANDOM_SAMPLE_NEG: 85
106
+
107
+ REPLACE_CLEAN_LABEL: False
108
+ SAFEGUARD_POSITIVE_CAPTION: True
109
+ SEPARATION_TOKENS: ". "
110
+ SHUFFLE_SEED: 0
111
+ TEST_DATASETNAME_SUFFIX: ""
112
+ TRAIN_DATASETNAME_SUFFIX: ""
113
+ USE_CAPTION_PROMPT: False
114
+ USE_COCO_FORMAT: False
115
+ USE_CROWD: False
116
+ USE_OD_AUG: False
117
+ USE_OVERRIDE_CATEGORY: False
118
+ USE_SUPRESS_QUERY: False
119
+ VG_COPY: 1
120
+
121
+ INPUT:
122
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
123
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
124
+ MIN_SIZE_TRAIN: 800
125
+ MAX_SIZE_TRAIN: 1333
126
+ MIN_SIZE_TEST: 800
127
+ MAX_SIZE_TEST: 1333
128
+
129
+ AUGMENT:
130
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
131
+
132
+ DATALOADER:
133
+ SIZE_DIVISIBILITY: 32
134
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
135
+
136
+ SOLVER:
137
+ OPTIMIZER: ADAMW
138
+ BASE_LR: 0.0001
139
+ LANG_LR: 0.00001
140
+ WEIGHT_DECAY: 0.01
141
+ WEIGHT_DECAY_SCHEDULE: True
142
+ STEPS: (0.67, 0.89)
143
+ MAX_ITER: 800000
144
+ IMS_PER_BATCH: 64
145
+ WARMUP_ITERS: 2000
146
+ WARMUP_FACTOR: 0.001
147
+ TEST_WITH_INFERENCE: True
148
+ FIND_UNUSED_PARAMETERS: False
149
+ USE_AMP: True
150
+ MODEL_EMA: 0.999
151
+ CHECKPOINT_PERIOD: 2500
152
+
153
+
154
+ CLIP_GRADIENTS:
155
+ ENABLED: True
156
+ CLIP_TYPE: "full_model"
157
+ CLIP_VALUE: 1.0
158
+ NORM_TYPE: 2.0
159
+
160
+ TEST:
161
+ DURING_TRAINING: False
162
+ IMS_PER_BATCH: 64
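
SOLVER.STEPS here is fractional rather than absolute. Assuming the trainer multiplies fractional milestones by MAX_ITER (the values in (0, 1) strongly suggest this convention), the learning-rate decay points work out to:

    max_iter = 800_000                    # SOLVER.MAX_ITER
    steps = (0.67, 0.89)                  # SOLVER.STEPS, fractions of MAX_ITER
    milestones = [int(s * max_iter) for s in steps]
    print(milestones)                     # [536000, 712000]: iterations where the LR drops
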
configs/pretrain/mixed_nococo_flickr_objects365_refexpclean.yaml ADDED
@@ -0,0 +1,162 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_large_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v3"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
65
+ TEST: ("coco_2017_val", )
66
+ ADD_DET_PROMPT: False
67
+ ADD_DET_PROMPT_ADVANCED: False
68
+ ALTERNATIVE_TRAINING: False
69
+ BOX_THRESHOLD: 0.1
70
+ CAPTION_CONF: 0.9
71
+ CAPTION_FORMAT_VERSION: "v2"
72
+ CAPTION_MIN_BOX: 1
73
+ CAPTION_NMS: 0.9
74
+ CLASS_AGNOSTIC: False
75
+ CLASS_CONCAT: False
76
+ COCO_COPY: 1
77
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
78
+ DISABLE_CLIP_TO_IMAGE: False
79
+ DISABLE_SHUFFLE: False
80
+ FEW_SHOT: 0
81
+ FLICKR_COPY: 1
82
+ FLICKR_GT_TYPE: "separate"
83
+ FULL_QUESTION_PROB: 0.5
84
+ FURTHER_SCREEN: False
85
+ GENERAL_COPY: -1
86
+ GENERAL_COPY_TEST: -1
87
+ INFERENCE_CAPTION: False
88
+ IN_COPY: 1
89
+ LOCAL_DEBUG: False
90
+ LVIS_COPY: 1
91
+ LVIS_USE_NORMAL_AP: False
92
+ MAX_BOX: -1
93
+ MIXED_COPY: 1
94
+ MULTISTAGE_TRAINING: False
95
+ NEG_QUESTION_PROB: 0.8
96
+ NO_MINUS_ONE_FOR_ONE_HOT: False
97
+ OBJECT365_COPY: 1
98
+ OI_COPY: 1
99
+ ONE_HOT: False
100
+ PACK_RANDOM_CAPTION_NUMBER: 0
101
+ POS_QUESTION_PROB: 0.6
102
+ PREDOWNLOAD_BING: False
103
+ PREDOWNLOAD_WITH_AZCOPY: False
104
+ PROMPT_LIMIT_NEG: -1
105
+ RANDOM_SAMPLE_NEG: 85
106
+
107
+ REPLACE_CLEAN_LABEL: False
108
+ SAFEGUARD_POSITIVE_CAPTION: True
109
+ SEPARATION_TOKENS: ". "
110
+ SHUFFLE_SEED: 0
111
+ TEST_DATASETNAME_SUFFIX: ""
112
+ TRAIN_DATASETNAME_SUFFIX: ""
113
+ USE_CAPTION_PROMPT: False
114
+ USE_COCO_FORMAT: False
115
+ USE_CROWD: False
116
+ USE_OD_AUG: False
117
+ USE_OVERRIDE_CATEGORY: False
118
+ USE_SUPRESS_QUERY: False
119
+ VG_COPY: 1
120
+
121
+ INPUT:
122
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
123
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
124
+ MIN_SIZE_TRAIN: 800
125
+ MAX_SIZE_TRAIN: 1333
126
+ MIN_SIZE_TEST: 800
127
+ MAX_SIZE_TEST: 1333
128
+
129
+ AUGMENT:
130
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
131
+
132
+ DATALOADER:
133
+ SIZE_DIVISIBILITY: 32
134
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
135
+
136
+ SOLVER:
137
+ OPTIMIZER: ADAMW
138
+ BASE_LR: 0.0001
139
+ LANG_LR: 0.00001
140
+ WEIGHT_DECAY: 0.01
141
+ WEIGHT_DECAY_SCHEDULE: True
142
+ STEPS: (0.67, 0.89)
143
+ MAX_ITER: 800000
144
+ IMS_PER_BATCH: 64
145
+ WARMUP_ITERS: 5000
146
+ WARMUP_FACTOR: 0.001
147
+ TEST_WITH_INFERENCE: True
148
+ FIND_UNUSED_PARAMETERS: False
149
+ USE_AMP: True
150
+ MODEL_EMA: 0.999
151
+ CHECKPOINT_PERIOD: 2500
152
+
153
+
154
+ CLIP_GRADIENTS:
155
+ ENABLED: True
156
+ CLIP_TYPE: "full_model"
157
+ CLIP_VALUE: 1.0
158
+ NORM_TYPE: 2.0
159
+
160
+ TEST:
161
+ DURING_TRAINING: False
162
+ IMS_PER_BATCH: 64
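
MODEL_EMA: 0.999 maintains an exponential moving average of the weights, typically used for evaluation checkpoints. A minimal sketch of the usual update rule; the real hook lives in the trainer, and the names here are illustrative:

    import torch

    @torch.no_grad()
    def ema_update(ema_model, model, decay=0.999):   # decay = SOLVER.MODEL_EMA
        # After each optimizer step, blend the EMA copy toward the live weights.
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)
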
configs/pretrain_new/desco_fiber.yaml ADDED
@@ -0,0 +1,168 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v2"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ REGISTER:
65
+ bing_caption_train:
66
+ yaml_path: "GCC/CC3M/yamls"
67
+ yaml_name: "tiny.noun.harsh"
68
+ yaml_name_no_coco: "tiny.noun.harsh"
69
+ mixed_train_no_coco_noun:
70
+ coco_img_dir: "coco/train2014"
71
+ vg_img_dir: "gqa/images"
72
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
73
+ mixed_train_no_coco_gpt:
74
+ coco_img_dir: "coco/train2014"
75
+ vg_img_dir: "gqa/images"
76
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
77
+ flickr30k_train_gpt:
78
+ img_folder: "flickr30k/flickr30k_images/train"
79
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
80
+ is_train: True
81
+ mixed_train_no_coco_noun_gpt:
82
+ coco_img_dir: "coco/train2014"
83
+ vg_img_dir: "gqa/images"
84
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
85
+ mixed_train_no_coco_noun_gpt_0425:
86
+ coco_img_dir: "coco/train2014"
87
+ vg_img_dir: "gqa/images"
88
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
89
+ flickr30k_train_gpt_0425:
90
+ img_folder: "flickr30k/flickr30k_images/train"
91
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
92
+ is_train: True
93
+
94
+ CAPTION_CONF: 0.4
95
+ OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1"
96
+ CAPTION_AUGMENTATION_VERSION: "mixed.v4.8-2.drop_positive.control_pos.grouping.v1"
97
+ CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3-v4.9-1.drop_positive.control_pos.grouping.v1"
98
+ CAPTION_VOCAB_FILE: "tools/files/joint_vocab.merged.v1.tmp0.davincci.json"
99
+ DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
100
+
101
+ TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
102
+ TEST: ("coco_2017_val", )
103
+ ADD_DET_PROMPT: False
104
+ ADD_DET_PROMPT_ADVANCED: False
105
+ ALTERNATIVE_TRAINING: False
106
+ BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
107
+ ONE_HOT: False
108
+ FLICKR_COPY: 2
109
+ MIXED_COPY: 2
110
+ OBJECT365_COPY: 2
111
+ DISABLE_SHUFFLE: False
112
+ ADD_DET_PROMPT: False
113
+ RANDOM_SAMPLE_NEG: 85
114
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
115
+ FURTHER_SCREEN: True
116
+
117
+ CAPTION_NMS: -1.0
118
+ CAPTION_MIN_BOX: 1
119
+
120
+ SEPARATION_TOKENS: ". "
121
+
122
+ PACK_RANDOM_CAPTION_NUMBER: 20
123
+ NO_RANDOM_PACK_PROBABILITY: 0.4
124
+ RANDOM_PACK_PROB: 0.5
125
+ CAPTION_FORMAT_VERSION: "v2"
126
+
127
+ INPUT:
128
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
129
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
130
+ MIN_SIZE_TRAIN: 800
131
+ MAX_SIZE_TRAIN: 1333
132
+ MIN_SIZE_TEST: 800
133
+ MAX_SIZE_TEST: 1333
134
+
135
+ AUGMENT:
136
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
137
+
138
+ DATALOADER:
139
+ SIZE_DIVISIBILITY: 32
140
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
141
+
142
+ SOLVER:
143
+ OPTIMIZER: ADAMW
144
+ BASE_LR: 0.0001
145
+ LANG_LR: 0.00001
146
+ WEIGHT_DECAY: 0.01
147
+ WEIGHT_DECAY_SCHEDULE: True
148
+ STEPS: (0.67, 0.89)
149
+ MAX_ITER: 800000
150
+ IMS_PER_BATCH: 64
151
+ WARMUP_ITERS: 2000
152
+ WARMUP_FACTOR: 0.001
153
+ TEST_WITH_INFERENCE: True
154
+ FIND_UNUSED_PARAMETERS: False
155
+ USE_AMP: True
156
+ MODEL_EMA: 0.999
157
+ CHECKPOINT_PERIOD: 2500
158
+
159
+
160
+ CLIP_GRADIENTS:
161
+ ENABLED: True
162
+ CLIP_TYPE: "full_model"
163
+ CLIP_VALUE: 1.0
164
+ NORM_TYPE: 2.0
165
+
166
+ TEST:
167
+ DURING_TRAINING: False
168
+ IMS_PER_BATCH: 64
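
WARMUP_ITERS: 2000 with WARMUP_FACTOR: 0.001 ramps the learning rate from 0.1% of BASE_LR up to the full value over the first 2k iterations. A sketch of the standard linear-warmup rule these two fields imply (the exact schedule is defined by the trainer, so take this as the conventional reading):

    def lr_at(step, base_lr=1e-4, warmup_iters=2000, warmup_factor=1e-3):
        # Linear warmup: interpolate from warmup_factor * base_lr up to base_lr.
        if step >= warmup_iters:
            return base_lr
        alpha = step / warmup_iters
        return base_lr * (warmup_factor * (1 - alpha) + alpha)

    print(lr_at(0), lr_at(1000), lr_at(2000))   # ~1e-07, ~5.005e-05, 0.0001
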
configs/pretrain_new/desco_glip.yaml ADDED
@@ -0,0 +1,134 @@
 
1
+ # for final GLIP tiny, pre-trained from scratch
2
+ MODEL:
3
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
4
+ WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
5
+ RPN_ONLY: True
6
+ RPN_ARCHITECTURE: "VLDYHEAD"
7
+
8
+ BACKBONE:
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ LANGUAGE_BACKBONE:
13
+ FREEZE: False
14
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
15
+ MASK_SPECIAL: False
16
+
17
+ RPN:
18
+ USE_FPN: True
19
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
20
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
21
+ ASPECT_RATIOS: (1.0,)
22
+ SCALES_PER_OCTAVE: 1
23
+
24
+ DYHEAD:
25
+ CHANNELS: 256
26
+ NUM_CONVS: 6
27
+ USE_GN: True
28
+ USE_DYRELU: True
29
+ USE_DFCONV: True
30
+ USE_DYFUSE: True
31
+ TOPK: 9 # topk for selecting candidate positive samples from each level
32
+ SCORE_AGG: "MEAN"
33
+ LOG_SCALE: 0.0
34
+ USE_CHECKPOINT: True
35
+ FUSE_CONFIG:
36
+ EARLY_FUSE_ON: True
37
+ TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
38
+ USE_CLASSIFICATION_LOSS: False
39
+ USE_TOKEN_LOSS: False
40
+ USE_CONTRASTIVE_ALIGN_LOSS: False
41
+ CONTRASTIVE_HIDDEN_DIM: 64
42
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
43
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
44
+ USE_LAYER_SCALE: True
45
+ CLAMP_MIN_FOR_UNDERFLOW: True
46
+ CLAMP_MAX_FOR_OVERFLOW: True
47
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
48
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
49
+ CLAMP_DOT_PRODUCT: True
50
+
51
+ # used for the grounding model
52
+ DATASETS:
53
+ REGISTER:
54
+ bing_caption_train:
55
+ yaml_path: "GCC/CC3M/yamls"
56
+ yaml_name: "tiny.noun.harsh"
57
+ yaml_name_no_coco: "tiny.noun.harsh"
58
+ mixed_train_no_coco_noun_gpt_0425:
59
+ coco_img_dir: "coco/train2014"
60
+ vg_img_dir: "gqa/images"
61
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
62
+ flickr30k_train_gpt_0425:
63
+ img_folder: "flickr30k/flickr30k_images/train"
64
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
65
+ is_train: True
66
+
67
+ CAPTION_CONF: 0.4
68
+
69
+ CAPTION_AUGMENTATION_VERSION: "mixed.v4-v3.5-4-1.drop_positive.control_pos.grouping.v1" # for GoldG data; used by CaptionAugmentation to determine how to perform the augmentation
70
+ OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1" # for detection (OD) data; determines how detection labels are converted into grounding prompts
71
+ CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3.8-2.drop_positive.control_pos.grouping.v1" # for CC data; used by CaptionAugmentation to determine how to perform the augmentation
72
+ CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
73
+ DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
74
+
75
+ TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
76
+ TEST: ("coco_2017_val", )
77
+ BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
78
+ # BING_INDEX_LIST: [ 0, 1, ]
79
+ ONE_HOT: False
80
+ FLICKR_COPY: 2
81
+ MIXED_COPY: 2
82
+ OBJECT365_COPY: 1
83
+ DISABLE_SHUFFLE: False
84
+ ADD_DET_PROMPT: False
85
+ RANDOM_SAMPLE_NEG: 85
86
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
87
+ FURTHER_SCREEN: True
88
+
89
+ CAPTION_NMS: -1.0
90
+ CAPTION_MIN_BOX: 1
91
+
92
+ SEPARATION_TOKENS: ". "
93
+
94
+ PACK_RANDOM_CAPTION_NUMBER: 20
95
+ NO_RANDOM_PACK_PROBABILITY: 0.4
96
+ RANDOM_PACK_PROB: 0.5
97
+ CAPTION_FORMAT_VERSION: "v2"
98
+
99
+
100
+ INPUT:
101
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
102
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
103
+ MIN_SIZE_TRAIN: 800
104
+ MAX_SIZE_TRAIN: 1333
105
+ MIN_SIZE_TEST: 800
106
+ MAX_SIZE_TEST: 1333
107
+
108
+ AUGMENT:
109
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
110
+
111
+ DATALOADER:
112
+ SIZE_DIVISIBILITY: 32
113
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
114
+
115
+ SOLVER:
116
+ OPTIMIZER: ADAMW
117
+ BASE_LR: 0.0001
118
+ LANG_LR: 0.00001
119
+ WEIGHT_DECAY: 0.0001
120
+ STEPS: (0.67, 0.89)
121
+ #MAX_EPOCH: 12
122
+ MAX_ITER: 300000
123
+ IMS_PER_BATCH: 64
124
+ WARMUP_ITERS: 2000
125
+ WARMUP_FACTOR: 0.001
126
+ USE_AMP: True
127
+ MODEL_EMA: 0.999
128
+ FIND_UNUSED_PARAMETERS: False
129
+
130
+ CLIP_GRADIENTS:
131
+ ENABLED: True
132
+ CLIP_TYPE: "full_model"
133
+ CLIP_VALUE: 1.0
134
+ NORM_TYPE: 2.0
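
CLIP_GRADIENTS with CLIP_TYPE "full_model", NORM_TYPE 2.0 and CLIP_VALUE 1.0 reads as clipping the global L2 gradient norm over all parameters to 1.0. In plain PyTorch that operation is the following (an assumption about how "full_model" is interpreted, but the field names map onto clip_grad_norm_ directly):

    import torch

    model = torch.nn.Linear(4, 2)                    # stand-in for the detector
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    # Clip the global (full-model) L2 gradient norm to CLIP_VALUE = 1.0.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2.0)
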
configs/refcoco.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcoco_train", )
72
+ TEST: ("refcoco_val", )
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
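
INPUT.PIXEL_MEAN / PIXEL_STD in all of these configs are per-channel statistics in BGR channel order on 0-255 images (the Caffe/Detectron convention, matching OpenCV image loading). The equivalent normalization, spelled out:

    import numpy as np

    mean = np.array([103.530, 116.280, 123.675])   # INPUT.PIXEL_MEAN, BGR order
    std = np.array([57.375, 57.120, 58.395])       # INPUT.PIXEL_STD
    img_bgr = np.zeros((800, 1333, 3), dtype=np.float32)  # stand-in image in 0-255 range
    img_norm = (img_bgr - mean) / std              # per-channel standardization
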
configs/refcocog.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcocog_train", )
72
+ TEST: ("refcocog_val",)
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
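
TEST.EVAL_TASK "grounding" together with ATSS.INFERENCE_TH 0.0 means no score threshold is applied at inference, so every candidate box survives and the evaluator can score just the top-ranked box per referring expression. A sketch of that top-1 protocol (iou() is a hypothetical helper; the actual evaluator is part of the benchmark code):

    def top1_grounding_accuracy(predictions, gt_boxes, iou, thresh=0.5):
        # predictions: per-expression lists of {"box": ..., "score": ...} dicts.
        hits = 0
        for dets, gt in zip(predictions, gt_boxes):
            best = max(dets, key=lambda d: d["score"])  # INFERENCE_TH 0.0 keeps all boxes
            hits += iou(best["box"], gt) >= thresh
        return hits / len(gt_boxes)
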
configs/refcocoplus.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcoco+_train", )
72
+ TEST: ("refcoco+_val",)
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
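
SOLVER.USE_AMP: True enables mixed-precision training, which is also why FUSE_CONFIG carries all the CLAMP_* flags: they keep the fusion logits inside fp16 range. The standard PyTorch AMP loop such a config relies on looks roughly like this (a sketch, not the repository's trainer):

    import torch

    model = torch.nn.Linear(16, 4).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # SOLVER.OPTIMIZER / BASE_LR
    scaler = torch.cuda.amp.GradScaler()

    for _ in range(2):                           # stand-in training steps
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():          # forward pass in mixed precision
            loss = model(torch.randn(8, 16, device="cuda")).sum()
        scaler.scale(loss).backward()            # scaled backward avoids fp16 underflow
        scaler.step(optimizer)
        scaler.update()
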
configs/refexp/_refcoco+_testA.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco+_testA", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco+_testB.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco+_testB", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco_testA.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco_testA", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco_testB.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco_testB", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcocog_test.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcocog_test", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
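
The five configs/refexp/_*.yaml files above are not standalone: they carry only the test split plus a handful of overrides, and the debug command embedded in each passes such task files via --ft-tasks on top of a full config. Assuming the standard yacs merge order (base file first, task file second, later values win), the composition looks like:

    from maskrcnn_benchmark.config import cfg

    cfg.merge_from_file("configs/refcocoplus.yaml")             # full model + solver settings
    cfg.merge_from_file("configs/refexp/_refcoco+_testA.yaml")  # split-specific overrides
    cfg.freeze()
    print(cfg.DATASETS.TEST)                                    # ('refcoco+_testA',)
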
docs/intro.md ADDED
@@ -0,0 +1,287 @@
 
144
+
145
+ <div class="flex flex-col-reverse gap-x-2 sm:flex-row sm:items-center sm:justify-between xl:ml-auto"><div class="-mb-px flex h-12 items-center overflow-x-auto overflow-y-hidden sm:h-[3.25rem]"><a class="tab-alternate " href="/spaces/haotiz/glip-zeroshot-demo"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg>
146
+ App
147
+
148
+
149
+ </a><a class="tab-alternate active" href="/spaces/haotiz/glip-zeroshot-demo/tree/main"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-tertiary" d="M21 19h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0-4h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0-8h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0 4h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M9 19a1 1 0 0 1-1-1V6a1 1 0 0 1 2 0v12a1 1 0 0 1-1 1zm-6-4.333a1 1 0 0 1-.64-1.769L3.438 12l-1.078-.898a1 1 0 0 1 1.28-1.538l2 1.667a1 1 0 0 1 0 1.538l-2 1.667a.999.999 0 0 1-.64.231z" fill="currentColor"></path></svg>
150
+ <span class="xl:hidden">Files</span>
151
+ <span class="hidden xl:inline">Files</span>
152
+
153
+
154
+ </a><a class="tab-alternate " href="/spaces/haotiz/glip-zeroshot-demo/discussions"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path><path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path></svg>
155
+ Community
156
+ <div class="ml-1.5 flex h-4 min-w-[1rem] items-center justify-center rounded px-1 text-xs leading-none shadow-sm bg-black text-white dark:bg-gray-800 dark:text-gray-200">2
157
+ </div>
158
+
159
+ </a>
160
+ </div>
161
+
162
+
163
+
164
+ <div class="hidden sm:block mt-2 lg:mt-0"><div class="relative ">
165
+ <button class="btn px-1 py-1 text-base translate-y-px " type="button">
166
+
167
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="p-0.5" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><circle cx="16" cy="7" r="3" fill="currentColor"></circle><circle cx="16" cy="16" r="3" fill="currentColor"></circle><circle cx="16" cy="25" r="3" fill="currentColor"></circle></svg>
168
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" class="absolute right-[-0.18rem] bottom-[-0.18rem] rounded-sm bg-gray-50 p-px text-[0.85rem] text-gray-500 dark:bg-gray-925" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill="currentColor" d="M7.975 3.489a.438.438 0 0 1 0 .618L4.262 7.82a.416.416 0 0 1-.307.126.427.427 0 0 1-.311-.126.438.438 0 0 1 0-.618L7.357 3.49a.438.438 0 0 1 .618 0ZM6.427 8.132 4.88 9.675a2.17 2.17 0 0 1-3.09 0 2.188 2.188 0 0 1 0-3.09l1.542-1.548a.437.437 0 0 0-.618-.619L1.166 5.966a3.063 3.063 0 0 0 4.332 4.332L7.046 8.75a.438.438 0 0 0-.619-.618Zm4.026-7.121a3.063 3.063 0 0 0-4.332 0L4.573 2.559a.438.438 0 0 0 .618.618L6.74 1.635a2.171 2.171 0 0 1 3.09 0 2.188 2.188 0 0 1 0 3.09L8.287 6.273a.432.432 0 0 0 0 .618.421.421 0 0 0 .475.097.438.438 0 0 0 .143-.097l1.548-1.548a3.068 3.068 0 0 0 0-4.332Z"></path></svg>
169
+
170
+ </button>
171
+
172
+
173
+ </div></div>
174
+ </div></div></header>
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+ </div>
191
+
192
+ <div class="container relative flex flex-col md:grid md:space-y-0 w-full md:grid-cols-12 space-y-4 md:gap-6 mb-16"><section class="pt-8 border-gray-100 col-span-full"><header class="flex flex-wrap items-center justify-start pb-2 md:justify-end lg:flex-nowrap"><div class="mr-4 flex min-w-0 basis-auto flex-wrap items-center md:flex-grow md:basis-full lg:basis-auto lg:flex-nowrap"><div class="SVELTE_HYDRATER contents" data-props="{&quot;path&quot;:&quot;docs/intro.md&quot;,&quot;repoName&quot;:&quot;haotiz/glip-zeroshot-demo&quot;,&quot;repoType&quot;:&quot;space&quot;,&quot;rev&quot;:&quot;main&quot;,&quot;refs&quot;:{&quot;branches&quot;:[{&quot;name&quot;:&quot;main&quot;,&quot;ref&quot;:&quot;refs/heads/main&quot;,&quot;targetCommit&quot;:&quot;7f799e88e07dd635fe84c11e57e3f6a08b59b911&quot;}],&quot;tags&quot;:[],&quot;converts&quot;:[]},&quot;view&quot;:&quot;blob&quot;}" data-target="BranchSelector"><div class="relative mr-4 mb-2">
193
+ <button class="text-sm md:text-base btn w-full cursor-pointer text-sm" type="button">
194
+ <svg class="mr-1.5 text-gray-700 dark:text-gray-400" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24" style="transform: rotate(360deg);"><path d="M13 14c-3.36 0-4.46 1.35-4.82 2.24C9.25 16.7 10 17.76 10 19a3 3 0 0 1-3 3a3 3 0 0 1-3-3c0-1.31.83-2.42 2-2.83V7.83A2.99 2.99 0 0 1 4 5a3 3 0 0 1 3-3a3 3 0 0 1 3 3c0 1.31-.83 2.42-2 2.83v5.29c.88-.65 2.16-1.12 4-1.12c2.67 0 3.56-1.34 3.85-2.23A3.006 3.006 0 0 1 14 7a3 3 0 0 1 3-3a3 3 0 0 1 3 3c0 1.34-.88 2.5-2.09 2.86C17.65 11.29 16.68 14 13 14m-6 4a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1M7 4a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1m10 2a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1z" fill="currentColor"></path></svg>
195
+ main
196
+ <svg class="-mr-1 text-gray-500" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path d="M16.293 9.293L12 13.586L7.707 9.293l-1.414 1.414L12 16.414l5.707-5.707z" fill="currentColor"></path></svg></button>
197
+
198
+
199
+ </div></div>
200
+ <div class="mb-2 flex items-center overflow-hidden"><a class="truncate text-gray-800 hover:underline" href="/spaces/haotiz/glip-zeroshot-demo/tree/main">glip-zeroshot-demo</a>
201
+ <span class="mx-1 text-gray-300">/</span>
202
+ <a class="truncate hover:underline dark:text-gray-300" href="/spaces/haotiz/glip-zeroshot-demo/tree/main/docs">docs
203
+ </a>
204
+ <span class="mx-1 text-gray-300">/</span><span class="dark:text-gray-300">intro.md</span></div></div>
205
+
206
+
207
+ </header>
208
+ <div class="SVELTE_HYDRATER contents" data-props="{&quot;commitLast&quot;:{&quot;date&quot;:&quot;2022-09-09T18:53:48.000Z&quot;,&quot;subject&quot;:&quot;initial commit&quot;,&quot;authors&quot;:[{&quot;_id&quot;:&quot;631516348d85ad332fa47b2c&quot;,&quot;avatar&quot;:&quot;/avatars/100f5ae3cf3c52faaecdaecd5d8f2881.svg&quot;,&quot;isHf&quot;:false,&quot;user&quot;:&quot;haotiz&quot;}],&quot;commit&quot;:{&quot;id&quot;:&quot;708dec4d8a2103c25db9eb7c24720af2dccfe72d&quot;,&quot;parentIds&quot;:[&quot;321aba1bb0f2180bfd7f7000fc7cc96699e74092&quot;]},&quot;title&quot;:&quot;initial commit&quot;},&quot;repo&quot;:{&quot;name&quot;:&quot;haotiz/glip-zeroshot-demo&quot;,&quot;type&quot;:&quot;space&quot;}}" data-target="LastCommit"><div class="from-gray-100-to-white flex items-baseline rounded-t-lg border border-b-0 bg-gradient-to-t px-3 py-2 dark:border-gray-800"><img class="mr-2.5 mt-0.5 h-4 w-4 self-center rounded-full" alt="haotiz's picture" src="/avatars/100f5ae3cf3c52faaecdaecd5d8f2881.svg">
209
+ <div class="mr-5 flex flex-none items-center truncate"><a class="hover:underline" href="/haotiz">haotiz
210
+ </a>
211
+
212
+ </div>
213
+ <div class="mr-4 truncate font-mono text-sm text-gray-500 hover:prose-a:underline"><!-- HTML_TAG_START -->initial commit<!-- HTML_TAG_END --></div>
214
+ <a class="rounded border bg-gray-50 px-1.5 text-sm hover:underline dark:border-gray-800 dark:bg-gray-900" href="/spaces/haotiz/glip-zeroshot-demo/commit/708dec4d8a2103c25db9eb7c24720af2dccfe72d">708dec4</a>
215
+
216
+ <time class="ml-auto hidden flex-none truncate pl-2 text-gray-500 dark:text-gray-400 lg:block" datetime="2022-09-09T18:53:48" title="Fri, 09 Sep 2022 18:53:48 GMT">about 1 year ago</time></div></div>
217
+ <div class="flex flex-wrap items-center border px-3 py-1.5 text-sm text-gray-800 dark:border-gray-800 dark:bg-gray-900"><div class="flex items-center gap-3 text-sm font-medium"><a class="rounded-md px-1.5 capitalize bg-gray-200 dark:bg-gray-800" href="/spaces/haotiz/glip-zeroshot-demo/blob/main/docs/intro.md">preview</a>
218
+ <a class="rounded-md px-1.5 capitalize " href="/spaces/haotiz/glip-zeroshot-demo/blob/main/docs/intro.md?code=true">code</a></div>
219
+ <div class="mx-4 text-gray-200">|</div>
220
+ <a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/raw/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M31 16l-7 7l-1.41-1.41L28.17 16l-5.58-5.59L24 9l7 7z" fill="currentColor"></path><path d="M1 16l7-7l1.41 1.41L3.83 16l5.58 5.59L8 23l-7-7z" fill="currentColor"></path><path d="M12.419 25.484L17.639 6l1.932.518L14.35 26z" fill="currentColor"></path></svg>
221
+ raw
222
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/commits/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M16 4C9.383 4 4 9.383 4 16s5.383 12 12 12s12-5.383 12-12S22.617 4 16 4zm0 2c5.535 0 10 4.465 10 10s-4.465 10-10 10S6 21.535 6 16S10.465 6 16 6zm-1 2v9h7v-2h-5V8z" fill="currentColor"></path></svg>
223
+ history
224
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/blame/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M16 2a14 14 0 1 0 14 14A14 14 0 0 0 16 2zm0 26a12 12 0 1 1 12-12a12 12 0 0 1-12 12z" fill="currentColor"></path><path d="M11.5 11a2.5 2.5 0 1 0 2.5 2.5a2.48 2.48 0 0 0-2.5-2.5z" fill="currentColor"></path><path d="M20.5 11a2.5 2.5 0 1 0 2.5 2.5a2.48 2.48 0 0 0-2.5-2.5z" fill="currentColor"></path></svg>
225
+ blame
226
+ </a><a class="my-1 mr-4 flex items-center hover:underline text-green-600 dark:text-gray-300" href="/spaces/haotiz/glip-zeroshot-demo/edit/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M2 26h28v2H2z" fill="currentColor"></path><path d="M25.4 9c.8-.8.8-2 0-2.8l-3.6-3.6c-.8-.8-2-.8-2.8 0l-15 15V24h6.4l15-15zm-5-5L24 7.6l-3 3L17.4 7l3-3zM6 22v-3.6l10-10l3.6 3.6l-10 10H6z" fill="currentColor"></path></svg>
227
+ contribute
228
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/delete/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M12 12h2v12h-2z" fill="currentColor"></path><path d="M18 12h2v12h-2z" fill="currentColor"></path><path d="M4 6v2h2v20a2 2 0 0 0 2 2h16a2 2 0 0 0 2-2V8h2V6zm4 22V8h16v20z" fill="currentColor"></path><path d="M12 2h8v2h-8z" fill="currentColor"></path></svg>
229
+ delete
230
+ </a>
231
+ <div class="mr-4 flex items-center text-gray-400"><svg class="text-gray-300 text-sm mr-1.5 -translate-y-px" width="1em" height="1em" viewBox="0 0 22 28" fill="none" xmlns="http://www.w3.org/2000/svg"><path fill-rule="evenodd" clip-rule="evenodd" d="M15.3634 10.3639C15.8486 10.8491 15.8486 11.6357 15.3634 12.1209L10.9292 16.5551C10.6058 16.8785 10.0814 16.8785 9.7579 16.5551L7.03051 13.8277C6.54532 13.3425 6.54532 12.5558 7.03051 12.0707C7.51569 11.5855 8.30234 11.5855 8.78752 12.0707L9.7579 13.041C10.0814 13.3645 10.6058 13.3645 10.9292 13.041L13.6064 10.3639C14.0916 9.8787 14.8782 9.8787 15.3634 10.3639Z" fill="currentColor"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M10.6666 27.12C4.93329 25.28 0 19.2267 0 12.7867V6.52001C0 5.40001 0.693334 4.41334 1.73333 4.01334L9.73333 1.01334C10.3333 0.786673 11 0.786673 11.6 1.02667L19.6 4.02667C20.1083 4.21658 20.5465 4.55701 20.8562 5.00252C21.1659 5.44803 21.3324 5.97742 21.3333 6.52001V12.7867C21.3333 19.24 16.4 25.28 10.6666 27.12Z" fill="currentColor" fill-opacity="0.22"></path><path d="M10.0845 1.94967L10.0867 1.94881C10.4587 1.8083 10.8666 1.81036 11.2286 1.95515L11.2387 1.95919L11.2489 1.963L19.2489 4.963L19.25 4.96342C19.5677 5.08211 19.8416 5.29488 20.0351 5.57333C20.2285 5.85151 20.3326 6.18203 20.3333 6.52082C20.3333 6.52113 20.3333 6.52144 20.3333 6.52176L20.3333 12.7867C20.3333 18.6535 15.8922 24.2319 10.6666 26.0652C5.44153 24.2316 1 18.6409 1 12.7867V6.52001C1 5.82357 1.42893 5.20343 2.08883 4.94803L10.0845 1.94967Z" stroke="currentColor" stroke-opacity="0.30" stroke-width="2"></path></svg>
232
+
233
+ No virus
234
+ </div>
235
+
236
+ <div class="dark:text-gray-300 sm:ml-auto">1.19 kB</div></div>
237
+
238
+ <div class="relative min-h-[100px] rounded-b-lg border border-t-0 leading-tight dark:border-gray-800 dark:bg-gray-925">
239
+ <div class="py-4 px-4 sm:px-6 prose hf-sanitized hf-sanitized-aKOIK5UWHeZSqPRRfdqN9">
240
+ ["**GLIP: Grounded Language-Image Pre-training. CVPR 2022, Best Paper Finalist**"](https://arxiv.org/abs/2112.03857)
+
+ This is the HuggingFace Gradio demo for GLIP. The model takes two inputs: an image and a text. The text can be either a natural sentence description (grounding) or a simple concatenation of category names (object detection).
+
+ The paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data, improving both tasks and bootstrapping a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich.
+
+ Code: https://github.com/microsoft/GLIP
+
+ **News**: We are also holding an ODinW challenge at [the CV in the Wild Workshop @ ECCV 2022](https://computer-vision-in-the-wild.github.io/eccv-2022/). We hope our open-source code encourages the community to participate in this challenge!
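As a rough illustration of the two text-input modes, here is a minimal sketch built on the `GLIPDemo` wrapper this Space uses; the weight path, image file, caption strings, and thresholds below are illustrative assumptions rather than the Space's exact settings:

```python
# Minimal sketch of the two input modes (grounding vs. detection).
# Assumptions: paths, captions, and thresholds are illustrative.
import numpy as np
from PIL import Image

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file("configs/pretrain/glip_Swin_T_O365_GoldG.yaml")
cfg.merge_from_list(["MODEL.WEIGHT", "MODEL/glip_tiny_model_o365_goldg_cc_sbu.pth"])
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])

glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7)

# GLIPDemo expects a BGR uint8 array (cf. INPUT.TO_BGR255 in the config).
image = np.array(Image.open("coco_000000281759.jpg").convert("RGB"))[:, :, ::-1]

# 1) Grounding: a natural sentence description.
result, _ = glip_demo.run_on_web_image(image, "a person holding an umbrella .", 0.5)

# 2) Detection: a simple concatenation of category names.
result, _ = glip_demo.run_on_web_image(image, "person . umbrella . car .", 0.5)
```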
maskrcnn_benchmark/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
maskrcnn_benchmark/config/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ from .defaults import _C as cfg
+ from .paths_catalog import try_to_find
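For orientation, the re-exported `cfg` is a standard yacs `CfgNode`, so the usual load-then-override flow applies. A minimal sketch, assuming an illustrative YAML path:

```python
from maskrcnn_benchmark.config import cfg

# Start from the defaults declared in defaults.py, layer a YAML config on
# top, then override individual keys from a flat key/value list.
cfg.merge_from_file("configs/pretrain/glip_Swin_T_O365_GoldG.yaml")  # illustrative path
cfg.merge_from_list(["MODEL.DEVICE", "cpu", "INPUT.MIN_SIZE_TEST", 600])
cfg.freeze()  # make the tree immutable before model construction

print(cfg.MODEL.META_ARCHITECTURE)  # "GeneralizedRCNN" unless overridden
```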
maskrcnn_benchmark/config/defaults.py ADDED
@@ -0,0 +1,982 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import os
3
+
4
+ from yacs.config import CfgNode as CN
5
+
6
+ # -----------------------------------------------------------------------------
7
+ # Convention about Training / Test specific parameters
8
+ # -----------------------------------------------------------------------------
9
+ # Whenever an argument can be either used for training or for testing, the
10
+ # corresponding name will be post-fixed by a _TRAIN for a training parameter,
11
+ # or _TEST for a test-specific parameter.
12
+ # For example, the number of images during training will be
13
+ # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
14
+ # IMAGES_PER_BATCH_TEST
15
+
16
+ # -----------------------------------------------------------------------------
17
+ # Config definition
18
+ # -----------------------------------------------------------------------------
19
+
20
+ _C = CN()
21
+
22
+ _C.MODEL = CN()
23
+ _C.MODEL.RPN_ONLY = False
24
+ _C.MODEL.BOX_ON = True
25
+ _C.MODEL.MASK_ON = False
26
+ _C.MODEL.KEYPOINT_ON = False
27
+ _C.MODEL.DEVICE = "cuda"
28
+
29
+ _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
30
+
31
+ _C.MODEL.RPN_ARCHITECTURE = "RPN"
32
+ _C.MODEL.DEBUG = False # add debug flag
33
+ _C.MODEL.ONNX = False # add onnx flag
34
+
35
+ # If the WEIGHT starts with a catalog://, like :R-50, the code will look for
36
+ # the path in paths_catalog. Else, it will use it as the specified absolute
37
+ # path
38
+ _C.MODEL.WEIGHT = ""
39
+ _C.MODEL.PRETRAIN_NAME = ""
40
+
41
+ # If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
42
+ _C.MODEL.LINEAR_PROB = False
43
+
44
+ # -----------------------------------------------------------------------------
45
+ # Multitask Training / Test specific parameters
46
+ # -----------------------------------------------------------------------------
47
+ _C.MODEL.MULTITASK = CN(new_allowed=True)
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # INPUT
51
+ # -----------------------------------------------------------------------------
52
+ _C.INPUT = CN()
53
+ # Size of the smallest side of the image during training
54
+ _C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
55
+ # Maximum size of the side of the image during training
56
+ _C.INPUT.MAX_SIZE_TRAIN = 1333
57
+ # Size of the smallest side of the image during testing
58
+ _C.INPUT.MIN_SIZE_TEST = 800
59
+ # Maximum size of the side of the image during testing
60
+ _C.INPUT.MAX_SIZE_TEST = 1333
61
+ # Values to be used for image normalization
62
+ _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
63
+ # Values to be used for image normalization
64
+ _C.INPUT.PIXEL_STD = [1.0, 1.0, 1.0]
65
+ # Convert image to BGR format (for Caffe2 models), in range 0-255
66
+ _C.INPUT.TO_BGR255 = True
67
+ _C.INPUT.FORMAT = ""
68
+ _C.INPUT.FIX_RES = False
69
+
70
+ # -----------------------------------------------------------------------------
71
+ # Augmentation
72
+ # -----------------------------------------------------------------------------
73
+ _C.AUGMENT = CN()
74
+ _C.AUGMENT.USE_RA = 0
75
+ _C.AUGMENT.FLIP_PROB_TRAIN = 0.5
76
+ _C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
77
+ _C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
78
+
79
+ _C.AUGMENT.BRIGHTNESS = 0.0
80
+ _C.AUGMENT.CONTRAST = 0.0
81
+ _C.AUGMENT.SATURATION = 0.0
82
+ _C.AUGMENT.HUE = 0.0
83
+
84
+ _C.AUGMENT.CROP_PROB = 0.5
85
+ _C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
86
+ _C.AUGMENT.CROP_MIN_SIZE = 0.3
87
+
88
+ _C.AUGMENT.AFFINE_PROB = 0.5
89
+ _C.AUGMENT.AFFINE_R = (-10, 10)
90
+ _C.AUGMENT.AFFINE_T = (0.1, 0.1)
91
+ _C.AUGMENT.AFFINE_S = (0.9, 1.1)
92
+ _C.AUGMENT.AFFINE_SHEAR = (-2, 2)
93
+ _C.AUGMENT.AFFINE_FILL = (127.5, 127.5, 127.5)
94
+
95
+ _C.AUGMENT.ERASE_PROB = 0.0
96
+ _C.AUGMENT.ERASE_L = 0.02
97
+ _C.AUGMENT.ERASE_H = 1 / 3
98
+ _C.AUGMENT.ERASE_MIN_ASPECT = 0.3
99
+ _C.AUGMENT.ERASE_MODE = "const"
100
+ _C.AUGMENT.ERASE_MAX_COUNT = 1
101
+ _C.AUGMENT.ERASE_MAX_OVERLAP = 0.6
102
+ _C.AUGMENT.ERASE_MAX_VALUE = 255
103
+
104
+ _C.AUGMENT.MOSAIC_PROB = 0.0
105
+ _C.AUGMENT.MOSAIC_SHIFT = 0.5
106
+ _C.AUGMENT.MOSAIC_SIZE = -1
107
+
108
+ _C.AUGMENT.PASTE_PROB = 0.0
109
+ _C.AUGMENT.PASTE_CAT = ()
110
+ _C.AUGMENT.PASTE_NUM = 2
111
+ # -----------------------------------------------------------------------------
112
+ # Dataset
113
+ # -----------------------------------------------------------------------------
114
+ _C.DATASETS = CN()
115
+ # List of the dataset names for training, as present in paths_catalog.py
116
+ _C.DATASETS.TRAIN = ()
117
+ # List of the dataset names for testing, as present in paths_catalog.py
118
+ _C.DATASETS.TEST = ()
119
+ # Use is_crowd label
120
+ _C.DATASETS.USE_CROWD = False
121
+ _C.DATASETS.CLASS_AGNOSTIC = False
122
+ _C.DATASETS.CLASS_CONCAT = False
123
+ _C.DATASETS.MAX_BOX = -1
124
+ _C.DATASETS.SAMPLE_RATIO = 0.0
125
+ _C.DATASETS.FEW_SHOT = 0
126
+ # SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
127
+ _C.DATASETS.SHUFFLE_SEED = 0
128
+ _C.DATASETS.PREDEFINED_TEXT = ""
129
+ _C.DATASETS.ALTERNATIVE_TRAINING = False
130
+ _C.DATASETS.MULTISTAGE_TRAINING = False
131
+ _C.DATASETS.REGISTER = CN(new_allowed=True)
132
+ _C.DATASETS.BOX_THRESHOLD = 0.1
133
+ # Duplicate Dataset
134
+ _C.DATASETS.COCO_COPY = 1
135
+ _C.DATASETS.LVIS_COPY = 1
136
+ _C.DATASETS.FLICKR_COPY = 1
137
+ _C.DATASETS.MIXED_COPY = 1
138
+ _C.DATASETS.OBJECT365_COPY = 1
139
+ _C.DATASETS.VG_COPY = 1
140
+ _C.DATASETS.OI_COPY = 1
141
+ _C.DATASETS.IN_COPY = 1
142
+ _C.DATASETS.MIXED_GPT_COPY = 1
143
+
144
+ # Duplicate Dataset
145
+ _C.DATASETS.COCO_COPY = 1
146
+ _C.DATASETS.FLICKR_COPY = 1
147
+ _C.DATASETS.MIXED_COPY = 1
148
+ _C.DATASETS.OBJECT365_COPY = 1
149
+ _C.DATASETS.VG_COPY = 1
150
+ _C.DATASETS.OI_COPY = 1
151
+ _C.DATASETS.IN_COPY = 1
152
+ _C.DATASETS.REFCOCO_COPY = 1
153
+ _C.DATASETS.GENERAL_COPY = -1
154
+ _C.DATASETS.GENERAL_COPY_TEST = -1
155
+
156
+ # OD to Grounding
157
+ _C.DATASETS.RANDOM_SAMPLE_NEG = -1
158
+ _C.DATASETS.ADD_DET_PROMPT = False
159
+ _C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
160
+ _C.DATASETS.USE_OD_AUG = False
161
+ _C.DATASETS.USE_COCO_FORMAT = False
162
+ _C.DATASETS.CONTROL_PROB = ()
163
+ _C.DATASETS.DISABLE_SHUFFLE = False
164
+ _C.DATASETS.PROMPT_VERSION = ""
165
+ _C.DATASETS.PROMPT_LIMIT_NEG = -1
166
+ _C.DATASETS.POS_QUESTION_PROB = 0.6
167
+ _C.DATASETS.NEG_QUESTION_PROB = 0.8
168
+ _C.DATASETS.FULL_QUESTION_PROB = 0.5
169
+ _C.DATASETS.ONE_HOT = False
170
+ _C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
171
+
172
+ _C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
173
+ _C.DATASETS.SEPARATION_TOKENS = " "
174
+
175
+ # LVIS
176
+ _C.DATASETS.LVIS_USE_NORMAL_AP = False
177
+ _C.DATASETS.LVIS_TOPK = 10000
178
+ _C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
179
+
180
+ # Caption
181
+ _C.DATASETS.BING_INDEX_LIST = []
182
+ _C.DATASETS.CAPTION_MIN_BOX = 1
183
+ _C.DATASETS.REPLACE_CLEAN_LABEL = False
184
+ _C.DATASETS.FURTHER_SCREEN = False
185
+ _C.DATASETS.CAPTION_CONF = 0.9
186
+ _C.DATASETS.CAPTION_NMS = 0.9
187
+ _C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
188
+ _C.DATASETS.INFERENCE_CAPTION = False
189
+ _C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
190
+ _C.DATASETS.RANDOM_PACK_PROB = -1.0
191
+ _C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
192
+ _C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
193
+ _C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
194
+ _C.DATASETS.LOCAL_DEBUG = False
195
+
196
+
197
+ # Od in the wild
198
+ _C.DATASETS.PREDEFINED_TEXT = None
199
+ _C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
200
+ _C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
201
+ _C.DATASETS.OVERRIDE_CATEGORY = None
202
+ _C.DATASETS.USE_OVERRIDE_CATEGORY = False
203
+ _C.DATASETS.SUPRESS_QUERY = None
204
+ _C.DATASETS.USE_SUPRESS_QUERY = False
205
+ _C.DATASETS.USE_CAPTION_PROMPT = False
206
+ _C.DATASETS.CAPTION_PROMPT = None
207
+
208
+ _C.DATASETS.PREDOWNLOAD_BING = False
209
+ _C.DATASETS.PREDOWNLOAD_WITH_AZCOPY = False
210
+ _C.DATASETS.FLICKR_GT_TYPE = "separate"
211
+
212
+ # PACO
213
+ _C.DATASETS.OD_TO_GROUNDING_VERSION = "legacy"
214
+
215
+ # description
216
+ _C.DATASETS.DESCRIPTION_FILE = None
217
+ _C.DATASETS.SIMILARITY_FILE = None
218
+ _C.DATASETS.CAPTION_VOCAB_FILE = None
219
+
220
+ # caption augmentation
221
+ _C.DATASETS.CAPTION_AUGMENTATION_VOCAB = None
222
+ _C.DATASETS.CAPTION_AUGMENTATION_VERSION = None
223
+
224
+ _C.DATASETS.CC_CAPTION_AUGMENTATION_VERSION = None
225
+
226
+ _C.DATASETS.KEEP_NOUN_RATIO = 0.0
227
+
228
+ # VQA
229
+ _C.DATASETS.DIVER_BOX_FOR_VQA = False
230
+
231
+ # -----------------------------------------------------------------------------
232
+ # DataLoader
233
+ # -----------------------------------------------------------------------------
234
+ _C.DATALOADER = CN()
235
+ # Number of data loading threads
236
+ _C.DATALOADER.NUM_WORKERS = 4
237
+ # If > 0, this enforces that each collated batch should have a size divisible
238
+ # by SIZE_DIVISIBILITY
239
+ _C.DATALOADER.SIZE_DIVISIBILITY = 0
240
+ # If True, each batch should contain only images for which the aspect ratio
241
+ # is compatible. This groups portrait images together, and landscape images
242
+ # are not batched with portrait images.
243
+ _C.DATALOADER.ASPECT_RATIO_GROUPING = True
244
+ # Define min number of keypoints required from GT, for example 10 out of 17
245
+ _C.DATALOADER.MIN_KPS_PER_IMS = 0
246
+ # Use random sampler during training
247
+ _C.DATALOADER.USE_RANDOM_SEED = False
248
+
249
+ _C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
250
+ # ---------------------------------------------------------------------------- #
251
+ # Backbone options
252
+ # ---------------------------------------------------------------------------- #
253
+ _C.MODEL.BACKBONE = CN()
254
+
255
+ # The backbone conv body to use
256
+ # The string must match a function that is imported in modeling.model_builder
257
+ # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
258
+ # backbone)
259
+ _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
260
+
261
+ # Add StopGrad at a specified stage so the bottom layers are frozen
262
+ _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
263
+ _C.MODEL.BACKBONE.FREEZE = False
264
+ _C.MODEL.BACKBONE.GROUP = 1
265
+ _C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
266
+ # Option to reset bn running statics
267
+ _C.MODEL.BACKBONE.RESET_BN = False
268
+ # Backbone Normalization Level
269
+ _C.MODEL.BACKBONE.NORM_LEVEL = 3
270
+ # BN for backbone
271
+ _C.MODEL.BACKBONE.USE_BN = False
272
+ # Sync BN for backbone
273
+ _C.MODEL.BACKBONE.USE_SYNCBN = False
274
+ _C.MODEL.BACKBONE.USE_NSYNCBN = False
275
+ # GN for backbone
276
+ _C.MODEL.BACKBONE.USE_GN = False
277
+ # Evo Norm for backbone
278
+ _C.MODEL.BACKBONE.USE_EN = False
279
+ # Layers for backbone
280
+ _C.MODEL.BACKBONE.USE_DFCONV = False
281
+ _C.MODEL.BACKBONE.USE_DYRELU = False
282
+ _C.MODEL.BACKBONE.USE_SE = False
283
+ _C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
284
+ _C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
285
+ _C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
286
+ _C.MODEL.BACKBONE.FPN_LAYER = ()
287
+ _C.MODEL.BACKBONE.USE_CHECKPOINT = False
288
+ # Add JF efficient det cfgs
289
+ _C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
290
+ _C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
291
+ _C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
292
+
293
+ _C.MODEL.BACKBONE.FUSION_VERSION = "v1" # Whether to use symmetric or non symmetric fusion
294
+
295
+ _C.MODEL.LANGUAGE_BACKBONE = CN()
296
+ _C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
297
+ _C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
298
+ _C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
299
+ _C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
300
+ _C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
301
+ _C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
302
+ _C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
303
+ _C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
304
+ _C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
305
+ _C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
306
+
307
+ _C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
308
+ _C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
309
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
310
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
311
+ _C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
312
+ _C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
313
+ _C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
314
+ _C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
315
+ _C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
316
+ _C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
317
+
318
+ _C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
319
+ # ---------------------------------------------------------------------------- #
320
+ # FPN options
321
+ # ---------------------------------------------------------------------------- #
322
+ _C.MODEL.FPN = CN()
323
+ _C.MODEL.FPN.FREEZE = False
324
+ _C.MODEL.FPN.USE_GN = False
325
+ _C.MODEL.FPN.USE_RELU = False
326
+ _C.MODEL.FPN.USE_DYRELU = False
327
+ _C.MODEL.FPN.DROP_BLOCK = True
328
+ _C.MODEL.FPN.DROP_PROB = 0.3
329
+ _C.MODEL.FPN.DROP_SIZE = 3
330
+ _C.MODEL.FPN.USE_SPP = False
331
+ _C.MODEL.FPN.USE_PAN = False
332
+ _C.MODEL.FPN.USE_DYHEAD = False
333
+ _C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
334
+ # ---------------------------------------------------------------------------- #
335
+ # BIFPN options
336
+ # ---------------------------------------------------------------------------- #
337
+ _C.MODEL.BIFPN = CN()
338
+ _C.MODEL.BIFPN.NUM_REPEATS = 1
339
+ _C.MODEL.BIFPN.USE_ATTENTION = True
340
+
341
+ # ---------------------------------------------------------------------------- #
342
+ # Group Norm options
343
+ # ---------------------------------------------------------------------------- #
344
+ _C.MODEL.GROUP_NORM = CN()
345
+ # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
346
+ _C.MODEL.GROUP_NORM.DIM_PER_GP = -1
347
+ # Number of groups in GroupNorm (-1 if using DIM_PER_GP)
348
+ _C.MODEL.GROUP_NORM.NUM_GROUPS = 16
349
+ # GroupNorm's small constant in the denominator
350
+ _C.MODEL.GROUP_NORM.EPSILON = 1e-5
351
+
352
+ # ---------------------------------------------------------------------------- #
353
+ # Evo Norm options
354
+ # ---------------------------------------------------------------------------- #
355
+ _C.MODEL.EVO_NORM = CN()
356
+ # Number of groups in EvoNorm (-1 if using DIM_PER_GP)
357
+ _C.MODEL.EVO_NORM.NUM_GROUPS = 8
358
+ # EvoNorm's small constant in the denominator
359
+ _C.MODEL.EVO_NORM.EPSILON = 1e-5
360
+
361
+ # ---------------------------------------------------------------------------- #
362
+ # RetinaNet Options (Follow the Detectron version)
363
+ # ---------------------------------------------------------------------------- #
364
+ _C.MODEL.RETINANET = CN()
365
+ # This is the number of foreground classes and background.
366
+ _C.MODEL.RETINANET.NUM_CLASSES = 81
367
+ # Convolutions to use in the cls and bbox tower
368
+ # NOTE: this doesn't include the last conv for logits
369
+ _C.MODEL.RETINANET.NUM_CONVS = 4
370
+ # During inference, #locs to select based on cls score before NMS is performed
371
+ # per FPN level
372
+ _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
373
+ # Prior prob for the positives at the beginning of training. This is used to set
374
+ # the bias init for the logits layer
375
+ _C.MODEL.RETINANET.PRIOR_PROB = 0.01
376
+ # Inference cls score threshold, anchors with score > INFERENCE_TH are
377
+ # considered for inference
378
+ _C.MODEL.RETINANET.INFERENCE_TH = 0.05
379
+ # NMS threshold used in RetinaNet
380
+ _C.MODEL.RETINANET.NMS_TH = 0.4
381
+ _C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
382
+
383
+ # ---------------------------------------------------------------------------- #
384
+ # Focal Loss Options (Follow the Detectron version)
385
+ # ---------------------------------------------------------------------------- #
386
+ _C.MODEL.FOCAL = CN()
387
+ # Weight for bbox_regression loss
388
+ _C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
389
+ # Smooth L1 loss beta for bbox regression
390
+ _C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
391
+ # IoU overlap ratio for labeling an anchor as positive
392
+ # Anchors with >= iou overlap are labeled positive
393
+ _C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
394
+ # IoU overlap ratio for labeling an anchor as negative
395
+ # Anchors with < iou overlap are labeled negative
396
+ _C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
397
+ # Focal loss parameter: alpha
398
+ _C.MODEL.FOCAL.LOSS_ALPHA = 0.25
399
+ # Focal loss parameter: gamma
400
+ _C.MODEL.FOCAL.LOSS_GAMMA = 2.0
401
+
402
+ # ---------------------------------------------------------------------------- #
403
+ # FCOS Options
404
+ # ---------------------------------------------------------------------------- #
405
+ _C.MODEL.FCOS = CN()
406
+ _C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
407
+ _C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
408
+ _C.MODEL.FCOS.PRIOR_PROB = 0.01
409
+ _C.MODEL.FCOS.INFERENCE_TH = 0.05
410
+ _C.MODEL.FCOS.NMS_TH = 0.6
411
+ _C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
412
+
413
+ # the number of convolutions used in the cls and bbox tower
414
+ _C.MODEL.FCOS.NUM_CONVS = 4
415
+ # if use deformable conv to align features
416
+ _C.MODEL.FCOS.USE_DFCONV = False
417
+
418
+ # if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
419
+ _C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
420
+ # IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
421
+ _C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
422
+
423
+ _C.MODEL.FCOS.NORM_REG_TARGETS = False
424
+ _C.MODEL.FCOS.CENTERNESS_ON_REG = False
425
+ _C.MODEL.FCOS.USE_GT_CENTER = False
426
+
427
+ _C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
428
+ _C.MODEL.FCOS.USE_GN = False
429
+ _C.MODEL.FCOS.USE_BN = False
430
+
431
+ _C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
432
+ _C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
433
+ _C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
434
+
435
+ # ---------------------------------------------------------------------------- #
436
+ # ATSS Options
437
+ # ---------------------------------------------------------------------------- #
438
+ _C.MODEL.ATSS = CN()
439
+ _C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
440
+ _C.MODEL.ATSS.PRIOR_PROB = 0.01
441
+ _C.MODEL.ATSS.INFERENCE_TH = 0.05
442
+ _C.MODEL.ATSS.NMS_TH = 0.6
443
+ _C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
444
+
445
+ # the number of convolutions used in the cls and bbox tower
446
+ _C.MODEL.ATSS.NUM_CONVS = 4
447
+ # the channels of convolutions used in the cls and bbox tower
448
+ _C.MODEL.ATSS.CHANNELS = 128
449
+ # if use deformable conv to align features
450
+ _C.MODEL.ATSS.USE_DFCONV = False
451
+
452
+ # topk for selecting candidate positive samples from each level
453
+ _C.MODEL.ATSS.TOPK = 9
454
+
455
+ # Weight for bbox_regression loss
456
+ _C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
457
+
458
+ _C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
459
+ _C.MODEL.ATSS.USE_GN = False
460
+ _C.MODEL.ATSS.USE_BN = False
461
+
462
+ _C.MODEL.ATSS.USE_DYRELU = False
463
+ _C.MODEL.ATSS.USE_SE = False
464
+
465
+ _C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
466
+ _C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
467
+ _C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
468
+ # ---------------------------------------------------------------------------- #
469
+ # DYHEAD Options
470
+ # ---------------------------------------------------------------------------- #
471
+ _C.MODEL.DYHEAD = CN()
472
+ _C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
473
+ _C.MODEL.DYHEAD.PRIOR_PROB = 0.01
474
+
475
+ # the number of convolutions used in the cls and bbox tower
476
+ _C.MODEL.DYHEAD.NUM_CONVS = 4
477
+ # the channels of convolutions used in the cls and bbox tower
478
+ _C.MODEL.DYHEAD.CHANNELS = 128
479
+ _C.MODEL.DYHEAD.GROUPS = 1
480
+ # if use deformable conv to align features
481
+ _C.MODEL.DYHEAD.USE_DFCONV = False
482
+
483
+ # topk for selecting candidate positive samples from each level
484
+ _C.MODEL.DYHEAD.TOPK = 9
485
+
486
+ _C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
487
+
488
+ _C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
489
+ _C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # # temperature (shallow contrastive)
490
+
491
+ _C.MODEL.DYHEAD.USE_GN = False
492
+ _C.MODEL.DYHEAD.USE_NSYNCBN = False
493
+ _C.MODEL.DYHEAD.USE_SYNCBN = False
494
+
495
+ _C.MODEL.DYHEAD.USE_DYFUSE = False
496
+ _C.MODEL.DYHEAD.USE_DYRELU = False
497
+
498
+ _C.MODEL.DYHEAD.CONV_FUNC = ""
499
+
500
+ # CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
501
+ _C.MODEL.DYHEAD.COSINE_SCALE = -1.0
502
+
503
+ _C.MODEL.DYHEAD.FUSE_CONFIG = CN()
504
+ _C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
505
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
506
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
507
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
508
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
509
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
510
+
511
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
512
+
513
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
514
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
515
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
516
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
517
+
518
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
519
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
520
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
521
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
522
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
523
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
524
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
525
+ _C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
526
+
527
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
528
+
529
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
530
+
531
+ # Controls for
532
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
533
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
534
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
535
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
536
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
537
+
538
+ # MLM Loss
539
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
540
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
541
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
542
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
543
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
544
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
545
+
546
+ # Shallow Contrastive Loss (FPN)
547
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
548
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
549
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
550
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
551
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
552
+
553
+ # Span Loss
554
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SPAN_LOSS = False # will reuse the green light span field to indicate span boundary
555
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SPAN_VERSION = None
556
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NOOBJ_TOKEN = False
557
+
558
+
559
+ # Shallow Contrastive Loss (BACKBONE)
560
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
561
+
562
+ _C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
563
+ # Mute non-essential tokens
564
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NON_ESSENTIAL_TOKENS = False
565
+ # use checkpoint to save memory
566
+ _C.MODEL.DYHEAD.USE_CHECKPOINT = False
567
+
568
+ # ---------------------------------------------------------------------------- #
569
+ # DYDETR Options
570
+ # ---------------------------------------------------------------------------- #
571
+ _C.MODEL.DYDETR = CN()
572
+ _C.MODEL.DYDETR.NHEADS = 8
573
+ _C.MODEL.DYDETR.DROPOUT = 0.0
574
+ _C.MODEL.DYDETR.DIM_FEEDFORWARD = 2048
575
+ _C.MODEL.DYDETR.ACTIVATION = "relu"
576
+ _C.MODEL.DYDETR.HIDDEN_DIM = 256
577
+ _C.MODEL.DYDETR.NUM_CLS = 1
578
+ _C.MODEL.DYDETR.NUM_REG = 3
579
+ _C.MODEL.DYDETR.NUM_HEADS = 6
580
+ _C.MODEL.DYDETR.NUM_CLASSES = 81
581
+ _C.MODEL.DYDETR.NUM_PROPOSALS = 300
582
+
583
+ # Dynamic Conv.
584
+ _C.MODEL.DYDETR.NUM_DYNAMIC = 2
585
+ _C.MODEL.DYDETR.DIM_DYNAMIC = 64
586
+
587
+ # Loss.
588
+ _C.MODEL.DYDETR.CLASS_WEIGHT = 2.0
589
+ _C.MODEL.DYDETR.GIOU_WEIGHT = 2.0
590
+ _C.MODEL.DYDETR.L1_WEIGHT = 5.0
591
+ _C.MODEL.DYDETR.DEEP_SUPERVISION = True
592
+ _C.MODEL.DYDETR.NO_OBJECT_WEIGHT = 0.1
593
+
594
+ # Focal Loss.
595
+ _C.MODEL.DYDETR.USE_FOCAL = True
596
+ _C.MODEL.DYDETR.ALPHA = 0.25
597
+ _C.MODEL.DYDETR.GAMMA = 2.0
598
+ _C.MODEL.DYDETR.PRIOR_PROB = 0.01
599
+
600
+ _C.MODEL.DYDETR.APPEND_BOX = False
601
+
602
+ # GROUNDING RELATED
603
+ _C.MODEL.DYDETR.INCLUDE_LANGUAGE_DECODER = False
604
+ _C.MODEL.DYDETR.USE_DOT_PRODUCT_TOKEN_LOSS = False
605
+ _C.MODEL.DYDETR.LOG_SCALE = 0.0 # temperature
606
+ _C.MODEL.DYDETR.RESET_PARAMETERS = True
607
+ _C.MODEL.DYDETR.USE_GROUNDING_MATCHER_SETCRITERION = False
608
+ _C.MODEL.DYDETR.MDETR_PLAIN_INFERENCE = False
609
+ _C.MODEL.DYDETR.OVERRIDE_LANGUAGE_MODEL_FOR_TOKEN_LOSS = False
610
+ _C.MODEL.DYDETR.NORMALIZE_PER_BOX = False
611
+ _C.MODEL.DYDETR.RESET_SKIP_DOT_PRODUCT_WEIGHTS = False
612
+ _C.MODEL.DYDETR.DEBUG = False
613
+ _C.MODEL.DYDETR.AGGREGATE_METHOD = "MEAN"
614
+ _C.MODEL.DYDETR.EARLY_FUSE_ON = False
615
+ _C.MODEL.DYDETR.DYTOWER_ON = False
616
+ _C.MODEL.DYDETR.USE_FUSED_LANGUAGE_FEATURES = True
617
+ # ---------------------------------------------------------------------------- #
618
+ # RPN options
619
+ # ---------------------------------------------------------------------------- #
620
+ _C.MODEL.RPN = CN()
621
+ _C.MODEL.RPN.USE_FPN = False
622
+ # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
623
+ _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
624
+ # Stride of the feature map that RPN is attached.
625
+ # For FPN, number of strides should match number of scales
626
+ _C.MODEL.RPN.ANCHOR_STRIDE = (16,)
627
+ # RPN anchor aspect ratios
628
+ _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
629
+ # Anchor shift away ration from the center for r,t,l,d
630
+ _C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
631
+ # Use center to decide anchor size
632
+ _C.MODEL.RPN.USE_RELATIVE_SIZE = False
633
+ # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
634
+ # Set to -1 or a large value, e.g. 100000, to disable pruning anchors
635
+ _C.MODEL.RPN.STRADDLE_THRESH = 0
636
+ # Anchor scales per octave for complex anchors
637
+ _C.MODEL.RPN.OCTAVE = 2.0
638
+ _C.MODEL.RPN.SCALES_PER_OCTAVE = 3
639
+ # Minimum overlap required between an anchor and ground-truth box for the
640
+ # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
641
+ # ==> positive RPN example)
642
+ _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
643
+ # Maximum overlap allowed between an anchor and ground-truth box for the
644
+ # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
645
+ # ==> negative RPN example)
646
+ _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
647
+ # Total number of RPN examples per image
648
+ _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
649
+ # Target fraction of foreground (positive) examples per RPN minibatch
650
+ _C.MODEL.RPN.POSITIVE_FRACTION = 0.5
651
+ # Number of top scoring RPN proposals to keep before applying NMS
652
+ # When FPN is used, this is *per FPN level* (not total)
653
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
654
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
655
+ # Number of top scoring RPN proposals to keep after applying NMS
656
+ _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
657
+ _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
658
+ # NMS threshold used on RPN proposals
659
+ _C.MODEL.RPN.NMS_THRESH = 0.7
660
+ # Proposal height and width both need to be greater than RPN_MIN_SIZE
661
+ # (a the scale used during training or inference)
662
+ _C.MODEL.RPN.MIN_SIZE = 0
663
+ # Number of top scoring RPN proposals to keep after combining proposals from
664
+ # all FPN levels
665
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
666
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
667
+ # Custom rpn head, empty to use default conv or separable conv
668
+ _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
669
+ _C.MODEL.RPN.FREEZE = False
670
+ _C.MODEL.RPN.FORCE_BOXES = False
671
+ _C.MODEL.RPN.RETURN_FUSED_FEATURES = False
672
+
+ # ---------------------------------------------------------------------------- #
+ # ROI HEADS options
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.ROI_HEADS = CN()
+ _C.MODEL.ROI_HEADS.USE_FPN = False
+ # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+ _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
+ # Overlap threshold for an RoI to be considered background
+ # (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
+ _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
+ # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+ # These are empirically chosen to approximately lead to unit variance targets
+ _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+ # RoI minibatch size *per image* (number of regions of interest [RoIs])
+ # Total number of RoIs per training minibatch =
+ # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
+ # E.g., a common configuration is: 512 * 2 * 8 = 8192
+ _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+ # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+ _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+ # Only used in test mode
+
+ # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+ # balance obtaining high recall with not having too many low precision
+ # detections that will slow down inference post-processing steps (like NMS)
+ _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
+ # Overlap threshold used for non-maximum suppression (suppress boxes with
+ # IoU >= this threshold)
+ _C.MODEL.ROI_HEADS.NMS = 0.5
+ # Maximum number of detections to return per image (100 is based on the limit
+ # established for the COCO dataset)
+ _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
+
+ _C.MODEL.ROI_BOX_HEAD = CN()
+ _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+ _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
+ _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
+ # Hidden layer dimension when using an MLP for the RoI box head
+ _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
+ # GN
+ _C.MODEL.ROI_BOX_HEAD.USE_GN = False
+ # Dilation
+ _C.MODEL.ROI_BOX_HEAD.DILATION = 1
+ _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
+ _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
+ # Use D2-style ROIAlignV2
+ _C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
+
+ _C.MODEL.ROI_MASK_HEAD = CN()
+ _C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+ _C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
+ _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
+ _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
+ _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
+ _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+ # Whether or not to resize and translate masks to the input image
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
+ # Dilation
+ _C.MODEL.ROI_MASK_HEAD.DILATION = 1
+ # GN
+ _C.MODEL.ROI_MASK_HEAD.USE_GN = False
+ # HG
+ _C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
+
+ _C.MODEL.ROI_KEYPOINT_HEAD = CN()
+ _C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
+ _C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
+ _C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
+ _C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
+ _C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
+ _C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = ()  # If left empty, use default names
+ _C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+
+ # ---------------------------------------------------------------------------- #
+ # ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+ # Note that parts of a resnet may be used for both the backbone and the head
+ # These options apply to both
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.RESNETS = CN()
+
+ _C.MODEL.RESNETS.USE_STEM3X3 = False
+ _C.MODEL.RESNETS.WITH_SE = False
+ _C.MODEL.RESNETS.USE_AVG_DOWN = False
+
+ # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+ _C.MODEL.RESNETS.NUM_GROUPS = 1
+
+ # Baseline width of each group
+ _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+ # Place the stride-2 conv on the 1x1 filter
+ # Use True only for the original MSRA ResNet; use False for C2 and Torch models
+ _C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+ # Residual transformation function
+ _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
+ # ResNet's stem function (conv1 and pool1)
+ _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
+
+ # Apply dilation in stage "res5"
+ _C.MODEL.RESNETS.RES5_DILATION = 1
+
+ _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
+ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+ _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+ _C.MODEL.RESNETS.REVISION = "resnet_light"
+ # Deformable convolutions
+ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
+ _C.MODEL.RESNETS.WITH_MODULATED_DCN = False
+ _C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
+
+ # ---------------------------------------------------------------------------- #
+ # Swin Transformer
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.SWINT = CN()
+ _C.MODEL.SWINT.EMBED_DIM = 96
+ _C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
+ _C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
+ _C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
+ _C.MODEL.SWINT.WINDOW_SIZE = 7
+ _C.MODEL.SWINT.MLP_RATIO = 4
+ _C.MODEL.SWINT.DROP_PATH_RATE = 0.2
+ _C.MODEL.SWINT.APE = False
+ _C.MODEL.SWINT.VERSION = "v1"
+ _C.MODEL.SWINT.OUT_NORM = True
+ _C.MODEL.SWINT.LAYER_SCALE = 0
+
+ # ---------------------------------------------------------------------------- #
+ # CVT SPEC
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.SPEC = CN(new_allowed=True)
+
+ # ---------------------------------------------------------------------------- #
+ # CLIP SPEC
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.CLIP = CN()
+ _C.MODEL.CLIP.CONTEXT_LENGTH = 256  # default 77
+ _C.MODEL.CLIP.WIDTH = 512
+ _C.MODEL.CLIP.LAYERS = 12
+ _C.MODEL.CLIP.HEADS = 8
+ _C.MODEL.CLIP.DROP_PATH = 0.0
+ _C.MODEL.CLIP.TOKENIZER = "clip"
+ _C.MODEL.CLIP.VOCAB_SIZE = 49408
+
+ # ---------------------------------------------------------------------------- #
+ # SEARCH
+ # ---------------------------------------------------------------------------- #
+
+ _C.SEARCH = CN()
+ _C.SEARCH.MAX_EPOCH = 20
+ _C.SEARCH.SELECT_NUM = 20
+ _C.SEARCH.POPULATION_NUM = 64
+ _C.SEARCH.MUTATION_NUM = 24
+ _C.SEARCH.CROSSOVER_NUM = 24
+ _C.SEARCH.MUTATION_PROB = 0.1
+
+ # ---------------------------------------------------------------------------- #
+ # Solver
+ # ---------------------------------------------------------------------------- #
+ _C.SOLVER = CN()
+ _C.SOLVER.USE_AMP = False
+
+ _C.SOLVER.MAX_ITER = 40000
+ _C.SOLVER.MULTI_MAX_ITER = ()  # set a different max iter for each stage
+ _C.SOLVER.MAX_EPOCH = 0  # any epoch number > 0 will overwrite max_iter
+ _C.SOLVER.MULTI_MAX_EPOCH = ()  # set a different max epoch for each stage
+
+ _C.SOLVER.OPTIMIZER = "SGD"  # "ADAMW"
+
+ _C.SOLVER.BASE_LR = 0.001
+
+ _C.SOLVER.LANG_LR = 0.00001
+ _C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
+ _C.SOLVER.FUSION_LR_FACTOR = 1.0
+
+ _C.SOLVER.BIAS_LR_FACTOR = 2
+ _C.SOLVER.GRAD_CLIP = 0.0
+ # D2 gradient clip
+ _C.SOLVER.CLIP_GRADIENTS = CN()
+ _C.SOLVER.CLIP_GRADIENTS.ENABLED = False
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
+ _C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+ _C.SOLVER.MODEL_EMA = 0.0
+
+ _C.SOLVER.MOMENTUM = 0.9
+
+ _C.SOLVER.WEIGHT_DECAY = 0.0005
+ _C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
+ _C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
+ _C.SOLVER.WEIGHT_DECAY_HEAD_FACTOR = 1.0
+
+ # Use a cosine lr schedule to replace the default multistage one
+ _C.SOLVER.USE_COSINE = False
+ _C.SOLVER.MIN_LR = 0.000001
+
+ _C.SOLVER.GAMMA = 0.1
+ _C.SOLVER.STEPS = (30000,)
+
+ _C.SOLVER.USE_AUTOSTEP = False
+ _C.SOLVER.STEP_PATIENCE = 5
+
+ _C.SOLVER.WARMUP_FACTOR = 1.0 / 3
+ _C.SOLVER.WARMUP_ITERS = 500
+ _C.SOLVER.WARMUP_METHOD = "linear"
+
+ _C.SOLVER.CHECKPOINT_PERIOD = 2500
+ _C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
+ _C.SOLVER.TEST_WITH_INFERENCE = False
+ _C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
+ # Number of images per batch
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+ # see 2 images per batch
+ _C.SOLVER.IMS_PER_BATCH = 16
+ # This is the max negative ratio allowed per batch
+ _C.SOLVER.MAX_NEG_PER_BATCH = 0.1
+
+ _C.SOLVER.SEED = 0
+ _C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
+
+ _C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
+ # -1 means tuning the whole model;
+ # 1 means tuning the whole language model; 1.5 means tuning the box head as well
+
+ _C.SOLVER.FIND_UNUSED_PARAMETERS = True
+ _C.SOLVER.DATASET_LENGTH = -1  # just for logging purposes
+ _C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
+ _C.SOLVER.USE_EMA_FOR_MONITOR = False
+
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
+ _C.SOLVER.RESUME_SKIP_SCHEDULE = False  # when we resume from a checkpoint, we can skip the schedule
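A small sketch of how a MAX_EPOCH setting plausibly translates into an iteration budget given the global IMS_PER_BATCH above. The trainer code is not part of this diff, so treat the exact rounding as an assumption; the dataset size is illustrative only.

dataset_length = 118287  # e.g. COCO train2017; illustrative only
ims_per_batch = 16       # SOLVER.IMS_PER_BATCH (global across all GPUs)
max_epoch = 12
max_iter = max_epoch * dataset_length // ims_per_batch  # 88715 iterations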
+
+ # ---------------------------------------------------------------------------- #
+ # Specific test options
+ # ---------------------------------------------------------------------------- #
+ _C.TEST = CN()
+ _C.TEST.EXPECTED_RESULTS = []
+ _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
+ _C.TEST.DURING_TRAINING = False
+ # Number of images per batch
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+ # see 2 images per batch
+ _C.TEST.IMS_PER_BATCH = 16
+ # Special test configuration
+ _C.TEST.USE_MULTISCALE = False
+ # _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
+ # _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
+ _C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
+ _C.TEST.RANGES = (
+     (96, 10000),
+     (96, 10000),
+     (64, 10000),
+     (64, 10000),
+     (64, 10000),
+     (0, 10000),
+     (0, 10000),
+     (0, 256),
+     (0, 256),
+     (0, 192),
+     (0, 192),
+     (0, 96),
+ )
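TEST.SCALES and TEST.RANGES have the same length, which suggests an elementwise pairing: each test resolution handles objects in the matching size range. A hedged sketch of that reading (the multiscale inference code is not shown in this diff):

for scale, (lo, hi) in zip(_C.TEST.SCALES, _C.TEST.RANGES):
    print("shorter side {}: keep detections with size in [{}, {})".format(scale, lo, hi))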
+ _C.TEST.MAX_SIZE = 2500
+ _C.TEST.FLIP = True
+ _C.TEST.SPECIAL_NMS = "none"  # ('none', 'soft-nms', 'vote', 'soft-vote')
+ _C.TEST.TH = 0.6  # threshold for nms or vote
+ _C.TEST.PRE_NMS_TOP_N = 1000
+ _C.TEST.NUM_CLASSES = 81
+ _C.TEST.SELECT_CLASSES = ()
+
+ _C.TEST.EVAL_TASK = ""
+ _C.TEST.SUBSET = -1
+ _C.TEST.CHUNKED_EVALUATION = -1
+ _C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
+ _C.TEST.CHUNK_METHOD = "random"  # or similar
+ _C.TEST.CHUNK_INFERENCE_VERSION = "v1"  # v2: modify the ATSS inference code slightly to make
+ # ---------------------------------------------------------------------------- #
+ # Misc options
+ # ---------------------------------------------------------------------------- #
+ _C.OUTPUT_DIR = "OUTPUT"
+
+ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
+
+ # TensorBoard experiment location
+ _C.TENSORBOARD_EXP = "OUTPUT"
+
+ _C.GLIPKNOW = CN()
+ _C.GLIPKNOW.KNOWLEDGE_FILE = ""
+ _C.GLIPKNOW.KNOWLEDGE_TYPE = ""
+ _C.GLIPKNOW.MAX_NUM_CLASSES_PER_BATCH_TRAIN = -1
+ _C.GLIPKNOW.PARALLEL_LANGUAGE_INPUT = False
+ _C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first"
+ _C.GLIPKNOW.GPT3_NUM = 5
+ _C.GLIPKNOW.WIKI_AND_GPT3 = False
maskrcnn_benchmark/config/paths_catalog.py ADDED
@@ -0,0 +1,779 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ """Centralized catalog of paths."""
+
+ import os
+
+
+ def try_to_find(file, return_dir=False, search_path=["./DATASET", "./OUTPUT", "./data", "./MODEL"]):
+     if not file:
+         return file
+
+     if file.startswith("catalog://"):
+         return file
+
+     DATASET_PATH = ["./"]
+     if "DATASET" in os.environ:
+         DATASET_PATH.append(os.environ["DATASET"])
+     DATASET_PATH += search_path
+
+     for path in DATASET_PATH:
+         if os.path.exists(os.path.join(path, file)):
+             if return_dir:
+                 return path
+             else:
+                 return os.path.join(path, file)
+
+     print("Cannot find {} in {}".format(file, DATASET_PATH))
+     exit(1)
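A quick illustration of how try_to_find resolves paths; the directory layout here is hypothetical:

# Assuming ./DATASET/coco/train2017 exists on disk:
try_to_find("coco/train2017")                   # -> "./DATASET/coco/train2017"
try_to_find("coco/train2017", return_dir=True)  # -> "./DATASET"
try_to_find("catalog://R-50")                   # catalog URIs pass through unchanged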
+
+
+ class DatasetCatalog(object):
+     DATASETS = {
+         # pretrained grounding dataset
+         # mixed vg and coco
+         "mixed_train": {
+             "coco_img_dir": "coco/train2014",
+             "vg_img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/final_mixed_train.json",
+         },
+         "mixed_train_no_coco": {
+             "coco_img_dir": "coco/train2014",
+             "vg_img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
+         },
+         # flickr30k
+         "flickr30k_train": {
+             "img_folder": "flickr30k/flickr30k_images/train",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
+             "is_train": True,
+         },
+         "flickr30k_val": {
+             "img_folder": "flickr30k/flickr30k_images/val",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
+             "is_train": False,
+         },
+         "flickr30k_test": {
+             "img_folder": "flickr30k/flickr30k_images/test",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
+             "is_train": False,
+         },
+         # refcoco
+         "refexp_all_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/final_refexp_val.json",
+             "is_train": False,
+         },
+         "refcoco_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_train.json",
+             "is_train": True,
+         },
+         "refcoco_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+             "is_train": False,
+         },
+         "refcoco_real_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+             "is_train": False,
+         },
+         "refcoco_testA": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_testA.json",
+             "is_train": False,
+         },
+         "refcoco_testB": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_testB.json",
+             "is_train": False,
+         },
+         "refcoco+_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_train.json",
+             "is_train": True,
+         },
+         "refcoco+_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_val.json",
+             "is_train": False,
+         },
+         "refcoco+_testA": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_testA.json",
+             "is_train": False,
+         },
+         "refcoco+_testB": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_testB.json",
+             "is_train": False,
+         },
+         "refcocog_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_train.json",
+             "is_train": True,
+         },
+         "refcocog_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_val.json",
+             "is_train": False,
+         },
+         "refcocog_test": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_test_corrected.json",
+             "is_train": False,
+         },
+         # gqa
+         "gqa_val": {"img_dir": "gqa/images", "ann_file": "mdetr_annotations/final_gqa_val.json", "is_train": False},
+         # phrasecut
+         "phrasecut_train": {
+             "img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
+             "is_train": True,
+         },
+         # caption
+         "bing_caption_train": {
+             "yaml_path": "BingData/predict_yaml",
+             "yaml_name": "dreamstime_com_dyhead_objvg_e39",
+             "yaml_name_no_coco": "dreamstime_com_Detection_Pretrain_NoCOCO_Packed125",
+             "is_train": True,
+         },
+         # od to grounding
+         # coco tsv
+         "coco_dt_train": {
+             "dataset_file": "coco_dt",
+             "yaml_path": "coco_tsv/coco_obj.yaml",
+             "is_train": True,
+         },
+         "COCO_odinw_train_8copy_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
+             "is_train": True,
+         },
+         "COCO_odinw_val_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/COCO_odinw_val.yaml",
+             "is_train": False,
+         },
+         # lvis tsv
+         "lvisv1_dt_train": {
+             "dataset_file": "lvisv1_dt",
+             "yaml_path": "coco_tsv/LVIS_v1_train.yaml",
+             "is_train": True,
+         },
+         "LVIS_odinw_train_8copy_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
+             "is_train": True,
+         },
+         # object365 tsv
+         "object365_dt_train": {
+             "dataset_file": "object365_dt",
+             "yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
+             "is_train": True,
+         },
+         "object365_odinw_2copy_dt_train": {
+             "dataset_file": "object365_odinw_dt",
+             "yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
+             "is_train": True,
+         },
+         "objects365_odtsv_train": {
+             "dataset_file": "objects365_odtsv",
+             "yaml_path": "Objects365/train.cas2000.yaml",
+             "is_train": True,
+         },
+         "objects365_odtsv_val": {
+             "dataset_file": "objects365_odtsv",
+             "yaml_path": "Objects365/val.yaml",
+             "is_train": False,
+         },
+         # ImageNet OD
+         "imagenetod_train_odinw_2copy_dt": {
+             "dataset_file": "imagenetod_odinw_dt",
+             "yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
+             "is_train": True,
+         },
+         # OpenImage OD
+         "oi_train_odinw_dt": {
+             "dataset_file": "oi_odinw_dt",
+             "yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
+             "is_train": True,
+         },
+         # vg tsv
+         "vg_dt_train": {
+             "dataset_file": "vg_dt",
+             "yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
+             "is_train": True,
+         },
+         "vg_odinw_clipped_8copy_dt_train": {
+             "dataset_file": "vg_odinw_clipped_8copy_dt",
+             "yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
+             "is_train": True,
+         },
+         "vg_vgoi6_clipped_8copy_dt_train": {
+             "dataset_file": "vg_vgoi6_clipped_8copy_dt",
+             "yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
+             "is_train": True,
+         },
+         # coco json
+         "coco_grounding_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/instances_train2017.json",
+             "is_train": True,
+         },
+         "lvis_grounding_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+         "lvis_evaluation_val": {
+             "img_dir": "lvis/coco2017",
+             "ann_file": "lvis/lvis_v1_minival_inserted_image_name.json",
+             "is_train": False,
+         },
+         "lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_val.json"},
+         # legacy detection dataset
+         "hsd_v001": {"img_dir": "hsd/20170901_Detection_HeadShoulder.V001/RawImages", "ann_file": "hsd/HSD_V001.json"},
+         "hsd_hddb": {"img_dir": "hddb/Images", "ann_file": "hddb/HDDB.json"},
+         "opencoco_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencoco_train.json"},
+         "opencoco_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencoco_val.json"},
+         "opencoco_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencoco_test.json"},
+         "openhuman_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhuman_train.json"},
+         "openhuman_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhuman_val.json"},
+         "openhuman_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhuman_test.json"},
+         "opencrowd_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencrowd_train.json"},
+         "opencrowd_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencrowd_val.json"},
+         "opencrowd_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencrowd_test.json"},
+         "opencar_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencar_train.json"},
+         "opencar_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencar_val.json"},
+         "opencar_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencar_test.json"},
+         "openhumancar_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhumancar_train.json"},
+         "openhumancar_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhumancar_val.json"},
+         "openhuamncar_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhumancar_test.json"},
+         "open500_train": {
+             "img_dir": "openimages/train",
+             "ann_file": "openimages/openimages_challenge_2019_train_bbox.json",
+         },
+         "open500_val": {
+             "img_dir": "openimages/val",
+             "ann_file": "openimages/openimages_challenge_2019_val_bbox.json",
+         },
+         "openproposal_test": {
+             "img_dir": "openimages/test2019",
+             "ann_file": "openimages/proposals_test.json",
+         },
+         "object365_train": {"img_dir": "object365/train", "ann_file": "object365/objects365_train.json"},
+         "object365_val": {"img_dir": "object365/val", "ann_file": "object365/objects365_val.json"},
+         "lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+         "lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_val.json"},
+         "image200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/im200_train.json"},
+         "image200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/im200_val.json"},
+         "coco_2017_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_train2017.json"},
+         "coco_2017_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/instances_val2017.json"},
+         "coco_2017_test": {"img_dir": "coco/test2017", "ann_file": "coco/annotations/image_info_test-dev2017.json"},
+         "coco10_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_minitrain2017.json"},
+         "coco_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/instances_train2014.json"},
+         "coco_2014_val": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_val2014.json"},
+         "coco_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_minival2014.json"},
+         "coco_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/instances_valminusminival2014.json",
+         },
+         "coco_2014_train_partial": {
+             "img_dir": "coco/train2014",
+             "ann_file": "coco/annotations/partial0.2_train2014.json",
+         },
+         "coco_2014_valminusminival_partial": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/partial0.2_valminusminival2014.json",
+         },
+         "coco_2014_train_few100": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few100_train2014.json"},
+         "coco_2014_train_few300": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few300_train2014.json"},
+         "coco_human_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/humans_train2014.json"},
+         "coco_human_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/humans_minival2014.json"},
+         "coco_human_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humans_valminusminival2014.json",
+         },
+         "coco_car_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/car_train2014.json"},
+         "coco_car_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/car_minival2014.json"},
+         "coco_car_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/car_valminusminival2014.json",
+         },
+         "coco_humancar_2014_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "coco/annotations/humancar_train2014.json",
+         },
+         "coco_humancar_2014_minival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humancar_minival2014.json",
+         },
+         "coco_humancar_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humancar_valminusminival2014.json",
+         },
+         "coco_keypoint_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/person_keypoints_train2017.json",
+         },
+         "coco_keypoint_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/person_keypoints_val2017.json",
+         },
+         "coco_headshoulder_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/headshoulder_train2017.json",
+         },
+         "coco_headshoulder_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/headshoulder_val2017.json",
+         },
+         "coco_hskeypoint_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/person_hskeypoints_train2017.json",
+         },
+         "coco_hskeypoint_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/person_hskeypoints_val2017.json",
+         },
+         "voc_2007_train": {"data_dir": "voc/VOC2007", "split": "train"},
+         "voc_2007_train_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json",
+         },
+         "voc_2007_val": {"data_dir": "voc/VOC2007", "split": "val"},
+         "voc_2007_val_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json",
+         },
+         "voc_2007_test": {"data_dir": "voc/VOC2007", "split": "test"},
+         "voc_2007_test_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json",
+         },
+         "voc_2012_train": {"data_dir": "voc/VOC2012", "split": "train"},
+         "voc_2012_train_cocostyle": {
+             "img_dir": "voc/VOC2012/JPEGImages",
+             "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json",
+         },
+         "voc_2012_val": {"data_dir": "voc/VOC2012", "split": "val"},
+         "voc_2012_val_cocostyle": {
+             "img_dir": "voc/VOC2012/JPEGImages",
+             "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json",
+         },
+         "voc_2012_test": {
+             "data_dir": "voc/VOC2012",
+             "split": "test"
+             # PASCAL VOC2012 didn't make the test annotations available, so there's no json annotation
+         },
+         "cityscapes_fine_instanceonly_seg_train_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json",
+         },
+         "cityscapes_fine_instanceonly_seg_val_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json",
+         },
+         "cityscapes_fine_instanceonly_seg_test_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json",
+         },
+         "crowdhuman_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_train.json"},
+         "crowdhuman_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_val.json"},
+         "crowdhead_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_train.json"},
+         "crowdhead_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_val.json"},
+         "crowdfull_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_train.json"},
+         "crowdfull_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_val.json"},
+         "ternium_train": {"img_dir": "ternium/images", "ann_file": "ternium/train_annotation.json"},
+         "ternium_val": {"img_dir": "ternium/images", "ann_file": "ternium/val_annotation.json"},
+         "ternium_test": {"img_dir": "ternium/images", "ann_file": "ternium/test_annotation.json"},
+         "ternium_test_crop": {"img_dir": "ternium/test_motion_crop", "ann_file": "ternium/test_motion_crop.json"},
+         "ternium_train_aug": {"img_dir": "ternium/train_crop_aug", "ann_file": "ternium/train_crop_aug.json"},
+         "ternium_test_aug": {"img_dir": "ternium/test_crop_aug", "ann_file": "ternium/test_motion_crop_aug.json"},
+         "ternium_vh_train": {
+             "img_dir": "ternium-vehicle/train_dataset_coco/images",
+             "ann_file": "ternium-vehicle/train_dataset_coco/coco_annotation.json",
+         },
+         "ternium_vh_val": {
+             "img_dir": "ternium-vehicle/validation_dataset_coco/images",
+             "ann_file": "ternium-vehicle/validation_dataset_coco/coco_annotation.json",
+         },
+         "msra_traffic": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/annotation.json"},
+         "msra_traffic_car": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/car_annotation.json"},
+         "msra_traffic_humancar": {
+             "img_dir": "msra-traffic/Images",
+             "ann_file": "msra-traffic/humancar_annotation.json",
+         },
+         "jigsaw_car_train": {"img_dir": "jigsaw", "ann_file": "jigsaw/train.json"},
+         "jigsaw_car_val": {"img_dir": "jigsaw", "ann_file": "jigsaw/val.json"},
+         "miotcd_train": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/train.json"},
+         "miotcd_val": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/val.json"},
+         "detrac_train": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/train.json"},
+         "detrac_val": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/val.json"},
+         "mrw": {"img_dir": "mrw/clips", "ann_file": "mrw/annotations.json"},
+         "mrw_bg": {"img_dir": "mrw/bg", "ann_file": "mrw/bg_annotations.json"},
+         "webmarket_bg": {"img_dir": "webmarket", "ann_file": "webmarket/bg_annotations.json"},
+         "mot17_train": {"img_dir": "mot/MOT17Det", "ann_file": "mot/MOT17Det/train.json"},
+         "egohands": {"img_dir": "egohands/images", "ann_file": "egohands/egohands.json"},
+         "hof": {"img_dir": "hof/images_original_size", "ann_file": "hof/train.json"},
+         "vlmhof": {"img_dir": "vlmhof/RGB", "ann_file": "vlmhof/train.json"},
+         "vgghands_train": {"img_dir": "vgghands/training_dataset", "ann_file": "vgghands/training.json"},
+         "vgghands_val": {"img_dir": "vgghands/validation_dataset", "ann_file": "vgghands/validation.json"},
+         "vgghands_test": {"img_dir": "vgghands/test_dataset", "ann_file": "vgghands/test.json"},
+         "od:coco_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/od_train2017.json"},
+         "od:coco_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/od_val2017.json"},
+         "od:lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/od_train-lvis.json"},
+         "od:lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/od_val-lvis.json"},
+         "od:o365_train": {"img_dir": "object365/train", "ann_file": "object365/od_train.json"},
+         "od:o365_val": {"img_dir": "object365/val", "ann_file": "object365/od_val.json"},
+         "od:oi500_train": {
+             "img_dir": "openimages/train",
+             "ann_file": "openimages/od_train2019.json",
+             "paste_dir": "openimages/panoptic_train_challenge_2019",
+             "paste_file": "openimages/panoptic_train2019.json",
+         },
+         "od:oi500_val": {
+             "img_dir": "openimages/val",
+             "ann_file": "openimages/od_val2019.json",
+             "paste_dir": "openimages/panoptic_val_challenge_2019",
+             "paste_file": "openimages/panoptic_val2019.json",
+         },
+         "od:im200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/train.json"},
+         "od:im200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/val.json"},
+         "cv:animal661_train": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/train.json"},
+         "cv:animal661_test": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/test.json"},
+         "cv:seeingai_train": {"img_dir": "cvtasks/SeeingAI/train.tsv", "ann_file": "cvtasks/SeeingAI/train.json"},
+         "cv:seeingai_test": {"img_dir": "cvtasks/SeeingAI/test.tsv", "ann_file": "cvtasks/SeeingAI/test.json"},
+         "cv:office_train": {
+             "img_dir": "cvtasks/Ping-Office-Env/train.tsv",
+             "ann_file": "cvtasks/Ping-Office-Env/train.json",
+         },
+         "cv:office_test": {
+             "img_dir": "cvtasks/Ping-Office-Env/test.tsv",
+             "ann_file": "cvtasks/Ping-Office-Env/test.json",
+         },
+         "cv:logo_train": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/train.json"},
+         "cv:logo_test": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/test.json"},
+         "cv:nba_train": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/train.json"},
+         "cv:nba_test": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/test.json"},
+         "cv:traffic_train": {"img_dir": "cvtasks/TrafficData/train.tsv", "ann_file": "cvtasks/TrafficData/train.json"},
+         "cv:traffic_test": {"img_dir": "cvtasks/TrafficData/test.tsv", "ann_file": "cvtasks/TrafficData/test.json"},
+         "cv:fashion5k_train": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/train.json"},
+         "cv:fashion5k_test": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/test.json"},
+         "cv:malaria_train": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/train.json"},
+         "cv:malaria_test": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/test.json"},
+         "cv:product_train": {
+             "img_dir": "cvtasks/product_detection",
+             "ann_file": "cvtasks/product_detection/train.json",
+         },
+         "cv:product_test": {"img_dir": "cvtasks/product_detection", "ann_file": "cvtasks/product_detection/test.json"},
+         "vl:vg_train": {"yaml_file": "vlp/visualgenome/train_vgoi6_clipped.yaml"},
+         "vl:vg_test": {"yaml_file": "vlp/visualgenome/test_vgoi6_clipped.yaml"},
+         "imagenet_train": {"img_dir": "imagenet-tsv/train.tsv", "ann_file": None},
+         "imagenet_val": {"img_dir": "imagenet-tsv/val.tsv", "ann_file": None},
+         "paco_lvis_v1_train_grounding": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_train.json",
+         },
+         "paco_lvis_v1_val": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_val.json",
+         },
+         "paco_lvis_v1_test": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_test.json",
+         },
+         "omnilabel_val": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3.json"},
+         "omnilabel_val_coco": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_coco.json"},
+         "omnilabel_val_o365": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_object365.json"},
+         "omnilabel_val_oi_v5": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_openimagesv5.json"},
+         "omnilabel_test": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_test_v0.1.3.json"},
+     }
+
+     @staticmethod
+     def set(name, info):
+         DatasetCatalog.DATASETS.update({name: info})
+
+     @staticmethod
+     def get(name):
+
+         if name.endswith("_bg"):
+             attrs = DatasetCatalog.DATASETS[name]
+             data_dir = try_to_find(attrs["ann_file"], return_dir=True)
+             args = dict(
+                 root=os.path.join(data_dir, attrs["img_dir"]),
+                 ann_file=os.path.join(data_dir, attrs["ann_file"]),
+             )
+             return dict(
+                 factory="Background",
+                 args=args,
+             )
+         else:
+             if "bing" in name.split("_"):
+                 attrs = DatasetCatalog.DATASETS["bing_caption_train"]
+             else:
+                 attrs = DatasetCatalog.DATASETS[name]
+             # if "yaml_file" in attrs:
+             #     yaml_file = try_to_find(attrs["yaml_file"], return_dir=False)
+             #     args = dict(yaml_file=yaml_file)
+             #     return dict(
+             #         factory="VGTSVDataset",
+             #         args=args,
+             #     )
+             # elif attrs["img_dir"].endswith('tsv'):
+             #     try:
+             #         data_dir = try_to_find(attrs["img_dir"], return_dir=True)
+             #         if attrs["ann_file"] is None:
+             #             map_file = None
+             #         elif attrs["ann_file"].startswith("./"):
+             #             map_file = attrs["ann_file"]
+             #         else:
+             #             map_file = os.path.join(data_dir, attrs["ann_file"])
+             #     except:
+             #         return None
+             #     args = dict(
+             #         tsv_file=os.path.join(data_dir, attrs["img_dir"]),
+             #         anno_file=map_file,
+             #     )
+             #     return dict(
+             #         factory="TSVDataset",
+             #         args=args,
+             #     )
+             if "voc" in name and "split" in attrs:
+                 data_dir = try_to_find(attrs["data_dir"], return_dir=True)
+                 args = dict(
+                     data_dir=os.path.join(data_dir, attrs["data_dir"]),
+                     split=attrs["split"],
+                 )
+                 return dict(
+                     factory="PascalVOCDataset",
+                     args=args,
+                 )
+             elif "omnilabel" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="OmniLabelDataset",
+                     args=args,
+                 )
+             elif "mixed" in name:
+                 vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
+                 coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
+                 ann_file = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
+                     img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
+                     ann_file=os.path.join(ann_file, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="MixedDataset",
+                     args=args,
+                 )
+             elif "flickr" in name:
+                 img_dir = try_to_find(attrs["img_folder"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_folder"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                     is_train=attrs["is_train"],
+                 )
+                 return dict(
+                     factory="FlickrDataset",
+                     args=args,
+                 )
+             elif "refexp" in name or "refcoco" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="RefExpDataset",
+                     args=args,
+                 )
+             elif "gqa" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="GQADataset",
+                     args=args,
+                 )
+             elif "phrasecut" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="PhrasecutDetection",
+                     args=args,
+                 )
+             elif "_caption" in name:
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 if "no_coco" in name:
+                     yaml_name = attrs["yaml_name_no_coco"]
+                 else:
+                     yaml_name = attrs["yaml_name"]
+                 yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
+                 args = dict(yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name))
+                 return dict(
+                     factory="CaptionTSV",
+                     args=args,
+                 )
+             elif "inferencecap" in name:
+                 yaml_file_name = try_to_find(attrs["yaml_path"])
+                 args = dict(yaml_file=yaml_file_name)
+                 return dict(
+                     factory="CaptionTSV",
+                     args=args,
+                 )
+             elif "pseudo_data" in name:
+                 args = dict(yaml_file=try_to_find(attrs["yaml_path"]))
+                 return dict(
+                     factory="PseudoData",
+                     args=args,
+                 )
+             elif "_dt" in name:
+                 dataset_file = attrs["dataset_file"]
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 args = dict(
+                     name=dataset_file,
+                     yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+                 )
+                 return dict(
+                     factory="CocoDetectionTSV",
+                     args=args,
+                 )
+             elif "_odtsv" in name:
+                 dataset_file = attrs["dataset_file"]
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 args = dict(
+                     name=dataset_file,
+                     yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+                 )
+                 return dict(
+                     factory="ODTSVDataset",
+                     args=args,
+                 )
+             elif "_grounding" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="CocoGrounding",
+                     args=args,
+                 )
+             elif "lvis_evaluation" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="LvisDetection",
+                     args=args,
+                 )
+             elif "paco" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="PacoDetection",
+                     args=args,
+                 )
+             else:
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 args = dict(
+                     root=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 for k, v in attrs.items():
+                     args.update({k: os.path.join(ann_dir, v)})
+                 return dict(
+                     factory="COCODataset",
+                     args=args,
+                 )
+
+         raise RuntimeError("Dataset not available: {}".format(name))
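For orientation, resolving a catalog name yields a factory name plus constructor kwargs; for example (the path prefix depends on where try_to_find locates the data, so it is illustrative):

info = DatasetCatalog.get("flickr30k_val")
# info == {
#     "factory": "FlickrDataset",
#     "args": {
#         "img_folder": "DATASET/flickr30k/flickr30k_images/val",
#         "ann_file": "DATASET/mdetr_annotations/final_flickr_separateGT_val.json",
#         "is_train": False,
#     },
# }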
+
+
+ class ModelCatalog(object):
+     S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
+     C2_IMAGENET_MODELS = {
+         "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+         "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+         "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+         "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+         "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+         "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+     }
+
+     C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
+     C2_DETECTRON_MODELS = {
+         "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
+         "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
+         "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
+         "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
+         "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
+         "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
+         "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
+         "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
+     }
+
+     @staticmethod
+     def get(name):
+         if name.startswith("Caffe2Detectron/COCO"):
+             return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
+         if name.startswith("ImageNetPretrained"):
+             return ModelCatalog.get_c2_imagenet_pretrained(name)
+         raise RuntimeError("model not present in the catalog {}".format(name))
+
+     @staticmethod
+     def get_c2_imagenet_pretrained(name):
+         prefix = ModelCatalog.S3_C2_DETECTRON_URL
+         name = name[len("ImageNetPretrained/"):]
+         name = ModelCatalog.C2_IMAGENET_MODELS[name]
+         url = "/".join([prefix, name])
+         return url
+
+     @staticmethod
+     def get_c2_detectron_12_2017_baselines(name):
+         # Detectron C2 models are stored following the structure
+         #   prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
+         # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
+         prefix = ModelCatalog.S3_C2_DETECTRON_URL
+         suffix = ModelCatalog.C2_DETECTRON_SUFFIX
+         # remove identification prefix
+         name = name[len("Caffe2Detectron/COCO/"):]
+         # split in <model_id> and <model_name>
+         model_id, model_name = name.split("/")
+         # parsing to make it match the url address from the Caffe2 models
+         model_name = "{}.yaml".format(model_name)
+         signature = ModelCatalog.C2_DETECTRON_MODELS[name]
+         unique_name = ".".join([model_name, signature])
+         url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
+         return url
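Usage follows directly from the code above; for example:

url = ModelCatalog.get("ImageNetPretrained/MSRA/R-50")
# -> "https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl"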
maskrcnn_benchmark/csrc/ROIAlign.h ADDED
@@ -0,0 +1,46 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+ // Interface for Python
+ at::Tensor ROIAlign_forward(const at::Tensor& input,
+                             const at::Tensor& rois,
+                             const float spatial_scale,
+                             const int pooled_height,
+                             const int pooled_width,
+                             const int sampling_ratio) {
+   if (input.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+ }
+
+ at::Tensor ROIAlign_backward(const at::Tensor& grad,
+                              const at::Tensor& rois,
+                              const float spatial_scale,
+                              const int pooled_height,
+                              const int pooled_width,
+                              const int batch_size,
+                              const int channels,
+                              const int height,
+                              const int width,
+                              const int sampling_ratio) {
+   if (grad.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
maskrcnn_benchmark/csrc/ROIPool.h ADDED
@@ -0,0 +1,48 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
+                                                    const at::Tensor& rois,
+                                                    const float spatial_scale,
+                                                    const int pooled_height,
+                                                    const int pooled_width) {
+   if (input.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+ at::Tensor ROIPool_backward(const at::Tensor& grad,
+                             const at::Tensor& input,
+                             const at::Tensor& rois,
+                             const at::Tensor& argmax,
+                             const float spatial_scale,
+                             const int pooled_height,
+                             const int pooled_width,
+                             const int batch_size,
+                             const int channels,
+                             const int height,
+                             const int width) {
+   if (grad.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+
maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ADDED
@@ -0,0 +1,41 @@
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+ // Interface for Python
+ at::Tensor SigmoidFocalLoss_forward(
+     const at::Tensor& logits,
+     const at::Tensor& targets,
+     const int num_classes,
+     const float gamma,
+     const float alpha) {
+   if (logits.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+ at::Tensor SigmoidFocalLoss_backward(
+     const at::Tensor& logits,
+     const at::Tensor& targets,
+     const at::Tensor& d_losses,
+     const int num_classes,
+     const float gamma,
+     const float alpha) {
+   if (logits.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
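This header only dispatches to the kernels. For reference, a minimal NumPy sketch of the sigmoid focal loss they compute, in the standard Lin et al. (2017) formulation; the kernel's exact masking and indexing may differ, and the class-id convention (0 = background) is an assumption:

import numpy as np

def sigmoid_focal_loss(logits, targets, num_classes, gamma, alpha):
    # logits: (N, num_classes); targets: (N,) class ids, 0 assumed background.
    p = np.clip(1.0 / (1.0 + np.exp(-logits)), 1e-7, 1 - 1e-7)
    cls = np.arange(1, num_classes + 1)        # one column per foreground class
    pos = targets[:, None] == cls[None, :]     # (N, num_classes) positive mask
    # alpha-weighted focal terms for positives and negatives
    return np.where(
        pos,
        -alpha * (1.0 - p) ** gamma * np.log(p),
        -(1.0 - alpha) * p ** gamma * np.log(1.0 - p),
    )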
maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ADDED
@@ -0,0 +1,257 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+ // implementation taken from Caffe2
5
+ template <typename T>
6
+ struct PreCalc {
7
+ int pos1;
8
+ int pos2;
9
+ int pos3;
10
+ int pos4;
11
+ T w1;
12
+ T w2;
13
+ T w3;
14
+ T w4;
15
+ };
16
+
17
+ template <typename T>
18
+ void pre_calc_for_bilinear_interpolate(
19
+ const int height,
20
+ const int width,
21
+ const int pooled_height,
22
+ const int pooled_width,
23
+ const int iy_upper,
24
+ const int ix_upper,
25
+ T roi_start_h,
26
+ T roi_start_w,
27
+ T bin_size_h,
28
+ T bin_size_w,
29
+ int roi_bin_grid_h,
30
+ int roi_bin_grid_w,
31
+ std::vector<PreCalc<T>>& pre_calc) {
32
+ int pre_calc_index = 0;
33
+ for (int ph = 0; ph < pooled_height; ph++) {
34
+ for (int pw = 0; pw < pooled_width; pw++) {
35
+ for (int iy = 0; iy < iy_upper; iy++) {
36
+ const T yy = roi_start_h + ph * bin_size_h +
37
+ static_cast<T>(iy + .5f) * bin_size_h /
38
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ T x = xx;
+ T y = yy;
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ PreCalc<T> pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y <= 0) {
+ y = 0;
+ }
+ if (x <= 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ // save weights and indices
+ PreCalc<T> pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ void ROIAlignForward_cpu_kernel(
+ const int nthreads,
+ const T* bottom_data,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois,
+ //int roi_cols,
+ T* top_data) {
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
+ int roi_cols = 5;
+
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ // roi could have 4 or 5 columns
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+ int roi_batch_ind = 0;
+ if (roi_cols == 5) {
+ roi_batch_ind = offset_bottom_rois[0];
+ offset_bottom_rois++;
+ }
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ // we want to precalculate indices and weights shared by all channels;
+ // this is the key point of the optimization
+ std::vector<PreCalc<T>> pre_calc(
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ PreCalc<T> pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+ pc.w2 * offset_bottom_data[pc.pos2] +
+ pc.w3 * offset_bottom_data[pc.pos3] +
+ pc.w4 * offset_bottom_data[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+ }
+
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
+ AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+ ROIAlignForward_cpu_kernel<scalar_t>(
+ output_size,
+ input.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ return output;
+ }
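Note: a minimal smoke test for the CPU entry point above (illustrative only, not part of this commit; assumes the declaration from cpu/vision.h is visible). ROIs use the 5-column layout (batch_index, x1, y1, x2, y2) consumed by the kernel.

#include <torch/torch.h>

at::Tensor roi_align_cpu_smoke_test() {
  // One 1x3x16x16 feature map and a single ROI covering its top-left corner.
  auto input = torch::rand({1, 3, 16, 16});
  auto rois = torch::tensor({0.f, 0.f, 0.f, 8.f, 8.f}, torch::kFloat).view({1, 5});
  // spatial_scale = 1.0, 7x7 output bins, 2 samples per bin axis.
  return ROIAlign_forward_cpu(input, rois, 1.0f, 7, 7, 2);  // shape (1, 3, 7, 7)
}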
maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp ADDED
@@ -0,0 +1,75 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include "cpu/vision.h"
+
+
+ template <typename scalar_t>
+ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+ auto ndets = dets.size(0);
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
+
+ auto suppressed = suppressed_t.data_ptr<uint8_t>();
+ auto order = order_t.data_ptr<int64_t>();
+ auto x1 = x1_t.data_ptr<scalar_t>();
+ auto y1 = y1_t.data_ptr<scalar_t>();
+ auto x2 = x2_t.data_ptr<scalar_t>();
+ auto y2 = y2_t.data_ptr<scalar_t>();
+ auto areas = areas_t.data_ptr<scalar_t>();
+
+ for (int64_t _i = 0; _i < ndets; _i++) {
+ auto i = order[_i];
+ if (suppressed[i] == 1)
+ continue;
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto iarea = areas[i];
+
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
+ auto j = order[_j];
+ if (suppressed[j] == 1)
+ continue;
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+ if (ovr >= threshold)
+ suppressed[j] = 1;
+ }
+ }
+ return at::nonzero(suppressed_t == 0).squeeze(1);
+ }
+
+ at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ at::Tensor result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
+ result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
+ });
+ return result;
+ }
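For reference, a hypothetical call to nms_cpu (assuming the declarations above are in scope); boxes use the (x1, y1, x2, y2) corner format with the +1 area convention seen in the kernel.

#include <torch/torch.h>

void nms_cpu_example() {
  // Two heavily overlapping boxes plus one disjoint box.
  auto dets = torch::tensor({10.f, 10.f, 50.f, 50.f,
                             12.f, 12.f, 52.f, 52.f,
                             100.f, 100.f, 140.f, 140.f}, torch::kFloat).view({3, 4});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f}, torch::kFloat);
  // With IoU threshold 0.5, box 1 (IoU ~0.83 with box 0) is suppressed.
  auto keep = nms_cpu(dets, scores, 0.5f);  // -> indices {0, 2}
}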
maskrcnn_benchmark/csrc/cpu/soft_nms.cpp ADDED
@@ -0,0 +1,117 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include "cpu/vision.h"
+
+
+ template <typename scalar_t>
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
+ at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ auto scores_t = scores.clone();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+ auto ndets = dets.size(0);
+ auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
+
+ auto x1 = x1_t.data_ptr<scalar_t>();
+ auto y1 = y1_t.data_ptr<scalar_t>();
+ auto x2 = x2_t.data_ptr<scalar_t>();
+ auto y2 = y2_t.data_ptr<scalar_t>();
+ auto s = scores_t.data_ptr<scalar_t>();
+ auto inds = inds_t.data_ptr<int64_t>();
+ auto areas = areas_t.data_ptr<scalar_t>();
+
+ for (int64_t i = 0; i < ndets; i++) {
+
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto is = s[i];
+ auto ii = inds[i];
+ auto iarea = areas[i];
+
+ auto maxpos = scores_t.slice(0, i, ndets).argmax().item<int64_t>() + i;
+
+ // add max box as a detection
+ x1[i] = x1[maxpos];
+ y1[i] = y1[maxpos];
+ x2[i] = x2[maxpos];
+ y2[i] = y2[maxpos];
+ s[i] = s[maxpos];
+ inds[i] = inds[maxpos];
+ areas[i] = areas[maxpos];
+
+ // swap ith box with position of max box
+ x1[maxpos] = ix1;
+ y1[maxpos] = iy1;
+ x2[maxpos] = ix2;
+ y2[maxpos] = iy2;
+ s[maxpos] = is;
+ inds[maxpos] = ii;
+ areas[maxpos] = iarea;
+
+ ix1 = x1[i];
+ iy1 = y1[i];
+ ix2 = x2[i];
+ iy2 = y2[i];
+ iarea = areas[i];
+
+ // NMS iterations; note that ndets changes if detection boxes
+ // fall below threshold
+ for (int64_t j = i + 1; j < ndets; j++) {
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+
+ s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
+
+ // if the box score falls below threshold, discard the box by
+ // swapping it with the last box and updating ndets
+ if (s[j] < threshold) {
+ x1[j] = x1[ndets - 1];
+ y1[j] = y1[ndets - 1];
+ x2[j] = x2[ndets - 1];
+ y2[j] = y2[ndets - 1];
+ s[j] = s[ndets - 1];
+ inds[j] = inds[ndets - 1];
+ areas[j] = areas[ndets - 1];
+ j--;
+ ndets--;
+ }
+ }
+ }
+ return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
+ }
+
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ std::pair<at::Tensor, at::Tensor> result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
+ result = soft_nms_cpu_kernel<scalar_t>(dets, scores, threshold, sigma);
+ });
+ return result;
+ }
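The score update in the inner loop is the Gaussian penalty from Soft-NMS (Bodla et al., 2017): instead of deleting overlapping boxes, each box's score decays with its squared IoU against the current maximum. Isolated as a scalar sketch (illustrative only):

#include <cmath>

float soft_nms_decay(float score, float iou, float sigma) {
  return score * std::exp(-(iou * iou) / sigma);  // s_j *= exp(-IoU^2 / sigma)
}
// e.g. soft_nms_decay(0.8f, 0.83f, 0.5f) ~= 0.20; boxes whose decayed score
// drops below `threshold` are swapped to the tail and ndets shrinks.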
maskrcnn_benchmark/csrc/cpu/vision.h ADDED
@@ -0,0 +1,22 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+ #include <torch/extension.h>
+
+
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+
+ at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold);
+
+
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma);
maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ADDED
@@ -0,0 +1,346 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __device__ T bilinear_interpolate(const T* bottom_data,
+ const int height, const int width,
+ T y, T x,
+ const int index /* index for debug only*/) {
+
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ return 0;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int) y;
+ int x_low = (int) x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = bottom_data[y_low * width + x_low];
+ T v2 = bottom_data[y_low * width + x_high];
+ T v3 = bottom_data[y_high * width + x_low];
+ T v4 = bottom_data[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+ }
+
+ template <typename T>
+ __global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois, T* top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+ }
+
+
+ template <typename T>
+ __device__ void bilinear_interpolate_gradient(
+ const int height, const int width,
+ T y, T x,
+ T & w1, T & w2, T & w3, T & w4,
+ int & x_low, int & x_high, int & y_low, int & y_high,
+ const int index /* index for debug only*/) {
+
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int) y;
+ x_low = (int) x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = bottom_data[y_low * width + x_low];
+ // T v2 = bottom_data[y_low * width + x_high];
+ // T v3 = bottom_data[y_high * width + x_low];
+ // T v4 = bottom_data[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+ }
+
+ template <typename T>
+ __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
+ const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(height, width, y, x,
+ w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high,
+ index);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
+ {
+ atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+ atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+ atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+ atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+ } // RoIAlignBackward
+
+
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+ RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois.contiguous().data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return output;
+ }
+
+ // TODO remove the dependency on input and use instead its sizes -> save memory
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+ RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad.contiguous().data_ptr<scalar_t>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ grad_input.data_ptr<scalar_t>(),
+ rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
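The sampling pattern used by both CUDA kernels above can be summarized on the host: each output bin (ph, pw) is covered by a roi_bin_grid_h x roi_bin_grid_w grid of points placed at sub-cell centers, each bilinearly interpolated and averaged. A scalar sketch of the coordinate computation (illustrative only; it mirrors the kernel expressions):

#include <utility>

std::pair<float, float> roi_align_sample_xy(float roi_start_h, float roi_start_w,
                                            float bin_size_h, float bin_size_w,
                                            int ph, int pw, int iy, int ix,
                                            int grid_h, int grid_w) {
  // Sub-cell centers: offset (i + 0.5) / grid within the bin, as in the kernel.
  float y = roi_start_h + ph * bin_size_h + (iy + 0.5f) * bin_size_h / grid_h;
  float x = roi_start_w + pw * bin_size_w + (ix + 0.5f) * bin_size_w / grid_w;
  return {y, x};
}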
maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu ADDED
@@ -0,0 +1,202 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels, const int height,
+ const int width, const int pooled_height, const int pooled_width,
+ const T* bottom_rois, T* top_data, int* argmax_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+ int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+ T bin_size_h = static_cast<T>(roi_height)
+ / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width)
+ / static_cast<T>(pooled_width);
+
+ int hstart = static_cast<int>(floor(static_cast<T>(ph)
+ * bin_size_h));
+ int wstart = static_cast<int>(floor(static_cast<T>(pw)
+ * bin_size_w));
+ int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
+ * bin_size_h));
+ int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
+ * bin_size_w));
+
+ // Add roi offsets and clip to input boundaries
+ hstart = min(max(hstart + roi_start_h, 0), height);
+ hend = min(max(hend + roi_start_h, 0), height);
+ wstart = min(max(wstart + roi_start_w, 0), width);
+ wend = min(max(wend + roi_start_w, 0), width);
+ bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+ // Define an empty pooling region to be zero
+ T maxval = is_empty ? 0 : -FLT_MAX;
+ // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+ int maxidx = -1;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int bottom_index = h * width + w;
+ if (offset_bottom_data[bottom_index] > maxval) {
+ maxval = offset_bottom_data[bottom_index];
+ maxidx = bottom_index;
+ }
+ }
+ }
+ top_data[index] = maxval;
+ argmax_data[index] = maxidx;
+ }
+ }
+
+ template <typename T>
+ __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
+ const int* argmax_data, const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int bottom_offset = (roi_batch_ind * channels + c) * height * width;
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ T* offset_bottom_diff = bottom_diff + bottom_offset;
+ const int* offset_argmax_data = argmax_data + top_offset;
+
+ int argmax = offset_argmax_data[ph * pooled_width + pw];
+ if (argmax != -1) {
+ atomicAdd(
+ offset_bottom_diff + argmax,
+ static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
+
+ }
+ }
+ }
+
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
+ RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ rois.contiguous().data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>(),
+ argmax.data_ptr<int>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+ }
+
+ // TODO remove the dependency on input and use instead its sizes -> save memory
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const at::Tensor& argmax,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+ // TODO add more checks
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
+ RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad.contiguous().data_ptr<scalar_t>(),
+ argmax.data_ptr<int>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ grad_input.data_ptr<scalar_t>(),
+ rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
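Unlike RoIAlign, RoIPool quantizes the ROI to integer bin edges and max-pools, keeping an argmax map so the backward pass routes each gradient to a single input cell. A hypothetical forward call (requires a CUDA build; the signature is the one defined above):

#include <torch/torch.h>

void roi_pool_example() {
  auto input = torch::rand({1, 256, 32, 32}, torch::kCUDA);
  // (batch_index, x1, y1, x2, y2) in image coordinates.
  auto rois = torch::tensor({0.f, 0.f, 0.f, 64.f, 64.f}, torch::kFloat)
                  .view({1, 5}).to(torch::kCUDA);
  // spatial_scale 0.25 maps image coords onto the 32x32 feature map.
  auto result = ROIPool_forward_cuda(input, rois, 0.25f, 7, 7);
  // std::get<0>(result): (1, 256, 7, 7) pooled features;
  // std::get<1>(result): int argmax indices consumed by ROIPool_backward_cuda.
}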
maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu ADDED
@@ -0,0 +1,188 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
+ // Cheng-Yang Fu
+ // cyfu@cs.unc.edu
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+ #include <cfloat>
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __global__ void SigmoidFocalLossForward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* losses) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80];
+
+ // Decide whether it is a positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+
+ // p = 1. / (1. + expf(-x)); p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+ // (1-p)**gamma * log(p)
+ T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
+
+ // p**gamma * log(1-p)
+ T term2 = powf(p, gamma) *
+ (-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
+
+ losses[i] = 0.0;
+ losses[i] += -c1 * term1 * zp;
+ losses[i] += -c2 * term2 * zn;
+
+ } // CUDA_1D_KERNEL_LOOP
+ } // SigmoidFocalLossForward
+
+
+ template <typename T>
+ __global__ void SigmoidFocalLossBackward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const T* d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* d_logits) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80], 0 is background;
+
+ // Decide whether it is a positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+ // p = 1. / (1. + expf(-x)); p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+ // (1-p)**g * (1 - p - g*p*log(p))
+ T term1 = powf((1. - p), gamma) *
+ (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
+
+ // (p**g) * (g*(1-p)*log(1-p) - p)
+ T term2 = powf(p, gamma) *
+ ((-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
+ (1. - p) * gamma - p);
+ d_logits[i] = 0.0;
+ d_logits[i] += -c1 * term1 * zp;
+ d_logits[i] += -c2 * term2 * zn;
+ d_logits[i] = d_logits[i] * d_losses[i];
+
+ } // CUDA_1D_KERNEL_LOOP
+ } // SigmoidFocalLossBackward
+
+
+ at::Tensor SigmoidFocalLoss_forward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+
+ auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
+ auto losses_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (losses.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return losses;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
+ SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
+ losses_size,
+ logits.contiguous().data_ptr<scalar_t>(),
+ targets.contiguous().data_ptr<int>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+ losses.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return losses;
+ }
+
+
+ at::Tensor SigmoidFocalLoss_backward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const at::Tensor& d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
+
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+ AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
+
+ auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
+ auto d_logits_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (d_logits.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
+ SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
+ d_logits_size,
+ logits.contiguous().data_ptr<scalar_t>(),
+ targets.contiguous().data_ptr<int>(),
+ d_losses.contiguous().data_ptr<scalar_t>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+ d_logits.data_ptr<scalar_t>());
+ });
+
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+ }
+
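The forward kernel evaluates the sigmoid focal loss of Lin et al. per (sample, class) logit: -alpha * (1-p)^gamma * log(p) when the class is the target, and -(1-alpha) * p^gamma * log(1-p) otherwise, with log(1-p) expanded into the numerically stable form seen above. A scalar reference (illustrative only, without the stable expansion):

#include <cfloat>
#include <cmath>

float sigmoid_focal_loss_ref(float logit, bool is_target_class,
                             float gamma, float alpha) {
  float p = 1.f / (1.f + std::exp(-logit));
  if (is_target_class)  // positive: -alpha * (1-p)^gamma * log(p)
    return -alpha * std::pow(1.f - p, gamma) * std::log(std::max(p, FLT_MIN));
  // negative: -(1-alpha) * p^gamma * log(1-p)
  return -(1.f - alpha) * std::pow(p, gamma) * std::log(std::max(1.f - p, FLT_MIN));
}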
maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu ADDED
@@ -0,0 +1,691 @@
1
+ // modify from
2
+ // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
3
+
4
+ #include <ATen/ATen.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+
7
+ #include <THC/THC.h>
8
+ #include <THC/THCDeviceUtils.cuh>
9
+
10
+ #include <vector>
11
+ #include <iostream>
12
+ #include <cmath>
13
+
14
+
15
+ void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
16
+ const int channels, const int height, const int width,
17
+ const int ksize_h, const int ksize_w, const int pad_h,
18
+ const int pad_w, const int stride_h, const int stride_w,
19
+ const int dilation_h, const int dilation_w,
20
+ const int parallel_imgs, const int deformable_group,
21
+ at::Tensor data_col);
22
+
23
+ void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
24
+ const int channels, const int height, const int width,
25
+ const int ksize_h, const int ksize_w, const int pad_h,
26
+ const int pad_w, const int stride_h, const int stride_w,
27
+ const int dilation_h, const int dilation_w,
28
+ const int parallel_imgs, const int deformable_group,
29
+ at::Tensor grad_im);
30
+
31
+ void deformable_col2im_coord(
32
+ const at::Tensor data_col, const at::Tensor data_im,
33
+ const at::Tensor data_offset, const int channels, const int height,
34
+ const int width, const int ksize_h, const int ksize_w, const int pad_h,
35
+ const int pad_w, const int stride_h, const int stride_w,
36
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
37
+ const int deformable_group, at::Tensor grad_offset);
38
+
39
+ void modulated_deformable_im2col_cuda(
40
+ const at::Tensor data_im, const at::Tensor data_offset,
41
+ const at::Tensor data_mask, const int batch_size, const int channels,
42
+ const int height_im, const int width_im, const int height_col,
43
+ const int width_col, const int kernel_h, const int kenerl_w,
44
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
45
+ const int dilation_h, const int dilation_w, const int deformable_group,
46
+ at::Tensor data_col);
47
+
48
+ void modulated_deformable_col2im_cuda(
49
+ const at::Tensor data_col, const at::Tensor data_offset,
50
+ const at::Tensor data_mask, const int batch_size, const int channels,
51
+ const int height_im, const int width_im, const int height_col,
52
+ const int width_col, const int kernel_h, const int kenerl_w,
53
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
54
+ const int dilation_h, const int dilation_w, const int deformable_group,
55
+ at::Tensor grad_im);
56
+
57
+ void modulated_deformable_col2im_coord_cuda(
58
+ const at::Tensor data_col, const at::Tensor data_im,
59
+ const at::Tensor data_offset, const at::Tensor data_mask,
60
+ const int batch_size, const int channels, const int height_im,
61
+ const int width_im, const int height_col, const int width_col,
62
+ const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
63
+ const int stride_h, const int stride_w, const int dilation_h,
64
+ const int dilation_w, const int deformable_group, at::Tensor grad_offset,
65
+ at::Tensor grad_mask);
66
+
67
+ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
68
+ at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
69
+ int padW, int dilationH, int dilationW, int group,
70
+ int deformable_group)
71
+ {
72
+ TORCH_CHECK(weight.ndimension() == 4,
73
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
74
+ "but got: %s",
75
+ weight.ndimension());
76
+
77
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
78
+
79
+ TORCH_CHECK(kW > 0 && kH > 0,
80
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
81
+ kW);
82
+
83
+ TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
84
+ "kernel size should be consistent with weight, ",
85
+ "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
86
+ kW, weight.size(2), weight.size(3));
87
+
88
+ TORCH_CHECK(dW > 0 && dH > 0,
89
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
90
+
91
+ TORCH_CHECK(
92
+ dilationW > 0 && dilationH > 0,
93
+ "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
94
+ dilationH, dilationW);
95
+
96
+ int ndim = input.ndimension();
97
+ int dimf = 0;
98
+ int dimh = 1;
99
+ int dimw = 2;
100
+
101
+ if (ndim == 4) {
102
+ dimf++;
103
+ dimh++;
104
+ dimw++;
105
+ }
106
+
107
+ TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
108
+ ndim);
109
+
110
+ long nInputPlane = weight.size(1) * group;
111
+ long inputHeight = input.size(dimh);
112
+ long inputWidth = input.size(dimw);
113
+ long nOutputPlane = weight.size(0);
114
+ long outputHeight =
115
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
116
+ long outputWidth =
117
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
118
+
119
+ TORCH_CHECK(nInputPlane % deformable_group == 0,
120
+ "input channels must divide deformable group size");
121
+
122
+ if (outputWidth < 1 || outputHeight < 1)
123
+ AT_ERROR(
124
+ "Given input size: (%ld x %ld x %ld). "
125
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
126
+ nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
127
+ outputWidth);
128
+
129
+ TORCH_CHECK(input.size(1) == nInputPlane,
130
+ "invalid number of input planes, expected: %d, but got: %d",
131
+ nInputPlane, input.size(1));
132
+
133
+ TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
134
+ "input image is smaller than kernel");
135
+
136
+ TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
137
+ "invalid spatial size of offset, expected height: %d width: %d, but "
138
+ "got height: %d width: %d",
139
+ outputHeight, outputWidth, offset.size(2), offset.size(3));
140
+
141
+ TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
142
+ "invalid number of channels of offset");
143
+
144
+ if (gradOutput != NULL) {
145
+ TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
146
+ "invalid number of gradOutput planes, expected: %d, but got: %d",
147
+ nOutputPlane, gradOutput->size(dimf));
148
+
149
+ TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
150
+ gradOutput->size(dimw) == outputWidth),
151
+ "invalid size of gradOutput, expected height: %d width: %d , but "
152
+ "got height: %d width: %d",
153
+ outputHeight, outputWidth, gradOutput->size(dimh),
154
+ gradOutput->size(dimw));
155
+ }
156
+ }
157
+
158
+ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
159
+ at::Tensor offset, at::Tensor output,
160
+ at::Tensor columns, at::Tensor ones, int kW,
161
+ int kH, int dW, int dH, int padW, int padH,
162
+ int dilationW, int dilationH, int group,
163
+ int deformable_group, int im2col_step)
164
+ {
165
+ // todo: resize columns to include im2col: done
166
+ // todo: add im2col_step as input
167
+ // todo: add new output buffer and transpose it to output (or directly
168
+ // transpose output) todo: possibly change data indexing because of
169
+ // parallel_imgs
170
+
171
+ shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
172
+ dilationH, dilationW, group, deformable_group);
173
+
174
+ input = input.contiguous();
175
+ offset = offset.contiguous();
176
+ weight = weight.contiguous();
177
+
178
+ int batch = 1;
179
+ if (input.ndimension() == 3) {
180
+ // Force batch
181
+ batch = 0;
182
+ input.unsqueeze_(0);
183
+ offset.unsqueeze_(0);
184
+ }
185
+
186
+ // todo: assert batchsize dividable by im2col_step
187
+
188
+ long batchSize = input.size(0);
189
+ long nInputPlane = input.size(1);
190
+ long inputHeight = input.size(2);
191
+ long inputWidth = input.size(3);
192
+
193
+ long nOutputPlane = weight.size(0);
194
+
195
+ long outputWidth =
196
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
197
+ long outputHeight =
198
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
199
+
200
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
201
+
202
+ output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
203
+ outputHeight, outputWidth});
204
+ columns = at::zeros(
205
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
206
+ input.options());
207
+
208
+ if (ones.ndimension() != 2 ||
209
+ ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
210
+ ones = at::ones({outputHeight, outputWidth}, input.options());
211
+ }
212
+
213
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
214
+ inputHeight, inputWidth});
215
+ offset =
216
+ offset.view({batchSize / im2col_step, im2col_step,
217
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
218
+
219
+ at::Tensor output_buffer =
220
+ at::zeros({batchSize / im2col_step, nOutputPlane,
221
+ im2col_step * outputHeight, outputWidth},
222
+ output.options());
223
+
224
+ output_buffer = output_buffer.view(
225
+ {output_buffer.size(0), group, output_buffer.size(1) / group,
226
+ output_buffer.size(2), output_buffer.size(3)});
227
+
228
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
229
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
230
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
231
+ dilationW, im2col_step, deformable_group, columns);
232
+
233
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
234
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
235
+ weight.size(2), weight.size(3)});
236
+
237
+ for (int g = 0; g < group; g++) {
238
+ output_buffer[elt][g] = output_buffer[elt][g]
239
+ .flatten(1)
240
+ .addmm_(weight[g].flatten(1), columns[g])
241
+ .view_as(output_buffer[elt][g]);
242
+ }
243
+ }
244
+
245
+ output_buffer = output_buffer.view(
246
+ {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
247
+ output_buffer.size(3), output_buffer.size(4)});
248
+
249
+ output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
250
+ im2col_step, outputHeight, outputWidth});
251
+ output_buffer.transpose_(1, 2);
252
+ output.copy_(output_buffer);
253
+ output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
254
+
255
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
256
+ offset = offset.view(
257
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
258
+
259
+ if (batch == 0) {
260
+ output = output.view({nOutputPlane, outputHeight, outputWidth});
261
+ input = input.view({nInputPlane, inputHeight, inputWidth});
262
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
263
+ }
264
+
265
+ return 1;
266
+ }
267
+
268
+ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
269
+ at::Tensor gradOutput, at::Tensor gradInput,
270
+ at::Tensor gradOffset, at::Tensor weight,
271
+ at::Tensor columns, int kW, int kH, int dW,
272
+ int dH, int padW, int padH, int dilationW,
273
+ int dilationH, int group,
274
+ int deformable_group, int im2col_step)
275
+ {
276
+ shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
277
+ dilationH, dilationW, group, deformable_group);
278
+
279
+ input = input.contiguous();
280
+ offset = offset.contiguous();
281
+ gradOutput = gradOutput.contiguous();
282
+ weight = weight.contiguous();
283
+
284
+ int batch = 1;
285
+
286
+ if (input.ndimension() == 3) {
287
+ // Force batch
288
+ batch = 0;
289
+ input = input.view({1, input.size(0), input.size(1), input.size(2)});
290
+ offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
291
+ gradOutput = gradOutput.view(
292
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
293
+ }
294
+
295
+ long batchSize = input.size(0);
296
+ long nInputPlane = input.size(1);
297
+ long inputHeight = input.size(2);
298
+ long inputWidth = input.size(3);
299
+
300
+ long nOutputPlane = weight.size(0);
301
+
302
+ long outputWidth =
303
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
304
+ long outputHeight =
305
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
306
+
307
+ TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
308
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
309
+ columns = at::zeros(
310
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
311
+ input.options());
312
+
313
+ // change order of grad output
314
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
315
+ nOutputPlane, outputHeight, outputWidth});
316
+ gradOutput.transpose_(1, 2);
317
+
318
+ gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
319
+ inputHeight, inputWidth});
320
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
321
+ inputHeight, inputWidth});
322
+ gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
323
+ deformable_group * 2 * kH * kW, outputHeight,
324
+ outputWidth});
325
+ offset =
326
+ offset.view({batchSize / im2col_step, im2col_step,
327
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
328
+
329
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
330
+ // divide into groups
331
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
332
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
333
+ weight.size(2), weight.size(3)});
334
+ gradOutput = gradOutput.view(
335
+ {gradOutput.size(0), group, gradOutput.size(1) / group,
336
+ gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
337
+
338
+ for (int g = 0; g < group; g++) {
339
+ columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
340
+ gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
341
+ }
342
+
343
+ columns =
344
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
345
+ gradOutput = gradOutput.view(
346
+ {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
347
+ gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
348
+
349
+ deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
350
+ inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
351
+ dilationH, dilationW, im2col_step, deformable_group,
352
+ gradOffset[elt]);
353
+
354
+ deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
355
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
356
+ dilationW, im2col_step, deformable_group, gradInput[elt]);
357
+ }
358
+
359
+ gradOutput.transpose_(1, 2);
360
+ gradOutput =
361
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
362
+
363
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
364
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
365
+ gradOffset = gradOffset.view(
366
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
367
+ offset = offset.view(
368
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
369
+
370
+ if (batch == 0) {
371
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
372
+ input = input.view({nInputPlane, inputHeight, inputWidth});
373
+ gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
374
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
375
+ gradOffset =
376
+ gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
377
+ }
378
+
+   return 1;
+ }
+
+ int deform_conv_backward_parameters_cuda(
+     at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
+     at::Tensor gradWeight,  // at::Tensor gradBias,
+     at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
+     int padW, int padH, int dilationW, int dilationH, int group,
+     int deformable_group, float scale, int im2col_step)
+ {
+   // todo: transpose and reshape outGrad
+   // todo: reshape columns
+   // todo: add im2col_step as input
+
+   shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
+               padW, dilationH, dilationW, group, deformable_group);
+
+   input = input.contiguous();
+   offset = offset.contiguous();
+   gradOutput = gradOutput.contiguous();
+
+   int batch = 1;
+
+   if (input.ndimension() == 3) {
+     // Force batch
+     batch = 0;
+     input = input.view(
+         at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+     gradOutput = gradOutput.view(
+         {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+   }
+
+   long batchSize = input.size(0);
+   long nInputPlane = input.size(1);
+   long inputHeight = input.size(2);
+   long inputWidth = input.size(3);
+
+   long nOutputPlane = gradWeight.size(0);
+
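+   // standard convolution output size:
+   //   out = floor((in + 2 * pad - (dilation * (k - 1) + 1)) / stride) + 1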
+   long outputWidth =
+       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+   long outputHeight =
+       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+   TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+   columns = at::zeros(
+       {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+       input.options());
+
+   gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                 nOutputPlane, outputHeight, outputWidth});
+   gradOutput.transpose_(1, 2);
+
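+   // Stage a contiguous copy of the transposed gradOutput so the im2col_step
+   // and spatial dimensions can be flattened together for the per-group GEMM
+   // below.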
+   at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+   gradOutputBuffer =
+       gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
+                              outputHeight, outputWidth});
+   gradOutputBuffer.copy_(gradOutput);
+   gradOutputBuffer =
+       gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
+                              im2col_step * outputHeight, outputWidth});
+
+   gradOutput.transpose_(1, 2);
+   gradOutput =
+       gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+   input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                       inputHeight, inputWidth});
+   offset =
+       offset.view({batchSize / im2col_step, im2col_step,
+                    deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
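+   // For each slice: recompute the column buffer with deformable_im2col, then
+   // accumulate gradWeight[g] += scale * gradOutput_g * columns_g^T per group.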
+   for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+     deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                       inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                       dilationW, im2col_step, deformable_group, columns);
+
+     // divide into groups
+     gradOutputBuffer = gradOutputBuffer.view(
+         {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+          gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     gradWeight =
+         gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+                          gradWeight.size(2), gradWeight.size(3)});
+
+     for (int g = 0; g < group; g++) {
+       gradWeight[g] = gradWeight[g]
+                           .flatten(1)
+                           .addmm_(gradOutputBuffer[elt][g].flatten(1),
+                                   columns[g].transpose(1, 0), 1.0, scale)
+                           .view_as(gradWeight[g]);
+     }
+     gradOutputBuffer = gradOutputBuffer.view(
+         {gradOutputBuffer.size(0),
+          gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+          gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                   gradWeight.size(2), gradWeight.size(3),
+                                   gradWeight.size(4)});
+   }
+
+   input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+   offset = offset.view(
+       {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+   if (batch == 0) {
+     gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+     input = input.view({nInputPlane, inputHeight, inputWidth});
+   }
+
+   return 1;
+ }
+
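+ // Modulated deformable convolution ("DCNv2"): in addition to the learned
+ // sampling offsets, every sampled value is scaled by a per-location
+ // modulation mask.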
+ void modulated_deform_conv_cuda_forward(
+     at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+     at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+     int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+     const int pad_h, const int pad_w, const int dilation_h,
+     const int dilation_w, const int group, const int deformable_group,
+     const bool with_bias)
+ {
+   TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+   TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+   const int batch = input.size(0);
+   const int channels = input.size(1);
+   const int height = input.size(2);
+   const int width = input.size(3);
+
+   const int channels_out = weight.size(0);
+   const int channels_kernel = weight.size(1);
+   const int kernel_h_ = weight.size(2);
+   const int kernel_w_ = weight.size(3);
+
+   if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+     AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
+              kernel_h, kernel_w, kernel_h_, kernel_w_);
+   if (channels != channels_kernel * group)
+     AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
+              channels, channels_kernel * group);
+
+   const int height_out =
+       (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+   const int width_out =
+       (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+   if (ones.ndimension() != 2 ||
+       ones.size(0) * ones.size(1) < height_out * width_out) {
+     // Resize plane and fill with ones...
+     ones = at::ones({height_out, width_out}, input.options());
+   }
+
+   // resize output
+   output = output.view({batch, channels_out, height_out, width_out}).zero_();
+   // resize temporary columns
+   columns =
+       at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+                 input.options());
+
+   output = output.view({output.size(0), group, output.size(1) / group,
+                         output.size(2), output.size(3)});
+
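+   // For each sample: gather the offset- and mask-modulated input patches
+   // into the column buffer, then a grouped GEMM computes
+   // output[b][g] = W_g * columns_g.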
+   for (int b = 0; b < batch; b++) {
+     modulated_deformable_im2col_cuda(
+         input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, columns);
+
+     // divide into groups
+     weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                           weight.size(2), weight.size(3)});
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+     for (int g = 0; g < group; g++) {
+       output[b][g] = output[b][g]
+                          .flatten(1)
+                          .addmm_(weight[g].flatten(1), columns[g])
+                          .view_as(output[b][g]);
+     }
+
+     weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                           weight.size(3), weight.size(4)});
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+   }
+
+   output = output.view({output.size(0), output.size(1) * output.size(2),
+                         output.size(3), output.size(4)});
+
+   if (with_bias) {
+     output += bias.view({1, bias.size(0), 1, 1});
+   }
+ }
+
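+ // Backward pass for the modulated variant: one call fills grad_input,
+ // grad_weight, grad_offset, grad_mask and, when with_bias is set, grad_bias.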
+ void modulated_deform_conv_cuda_backward(
+     at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+     at::Tensor offset, at::Tensor mask, at::Tensor columns,
+     at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+     at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+     int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+     int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+     const bool with_bias)
+ {
+   TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+   TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+   const int batch = input.size(0);
+   const int channels = input.size(1);
+   const int height = input.size(2);
+   const int width = input.size(3);
+
+   const int channels_kernel = weight.size(1);
+   const int kernel_h_ = weight.size(2);
+   const int kernel_w_ = weight.size(3);
+   if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+     AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
+              kernel_h, kernel_w, kernel_h_, kernel_w_);
+   if (channels != channels_kernel * group)
+     AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
+              channels, channels_kernel * group);
+
+   const int height_out =
+       (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+   const int width_out =
+       (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+   if (ones.ndimension() != 2 ||
+       ones.size(0) * ones.size(1) < height_out * width_out) {
+     // Resize plane and fill with ones...
+     ones = at::ones({height_out, width_out}, input.options());
+   }
+
+   grad_input = grad_input.view({batch, channels, height, width});
+   columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+                       input.options());
+
+   grad_output =
+       grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+                         grad_output.size(2), grad_output.size(3)});
+
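+   // Per-sample backward: (1) rebuild the column gradients as
+   // weight^T * grad_output, (2) scatter them into grad_offset/grad_mask and
+   // grad_input via the two col2im kernels, then (3) recompute the forward
+   // columns with im2col to accumulate grad_weight (and grad_bias).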
+   for (int b = 0; b < batch; b++) {
+     // divide into groups
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                           weight.size(2), weight.size(3)});
+
+     for (int g = 0; g < group; g++) {
+       columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                         grad_output[b][g].flatten(1), 0.0f, 1.0f);
+     }
+
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                           weight.size(3), weight.size(4)});
+
+     // gradient w.r.t. input coordinate data
+     modulated_deformable_col2im_coord_cuda(
+         columns, input[b], offset[b], mask[b], 1, channels, height, width,
+         height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
+         stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
+         grad_mask[b]);
+     // gradient w.r.t. input data
+     modulated_deformable_col2im_cuda(
+         columns, offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, grad_input[b]);
+
+     // gradient w.r.t. weight, dWeight should accumulate across the batch and
+     // group
+     modulated_deformable_im2col_cuda(
+         input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, columns);
+
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+                                     grad_weight.size(1), grad_weight.size(2),
+                                     grad_weight.size(3)});
+     if (with_bias)
+       grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+     for (int g = 0; g < group; g++) {
+       grad_weight[g] =
+           grad_weight[g]
+               .flatten(1)
+               .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+               .view_as(grad_weight[g]);
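+       // bias gradient: multiplying grad_output by the all-ones column vector
+       // sums it over all spatial positions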
+       if (with_bias) {
+         grad_bias[g] =
+             grad_bias[g]
+                 .view({-1, 1})
+                 .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                 .view(-1);
+       }
+     }
+
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                     grad_weight.size(2), grad_weight.size(3),
+                                     grad_weight.size(4)});
+     if (with_bias)
+       grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+   }
+   grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                   grad_output.size(2), grad_output.size(3),
+                                   grad_output.size(4)});
+ }