zdou0830 committed
Commit 749745d
1 Parent(s): b4c3cb2
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. MODEL/desco_glip_tiny.pth +3 -0
  2. app.py +7 -4
  3. coco_000000281759.jpg +275 -0
  4. configs/flickr/test.yaml +22 -0
  5. configs/flickr/val.yaml +22 -0
  6. configs/lvis/minival.yaml +30 -0
  7. configs/omnilabel/omnilabel_val_eval.yaml +18 -0
  8. configs/pretrain/_coco.yaml +3 -0
  9. configs/pretrain/fiber_cc.yaml +144 -0
  10. configs/pretrain/fiber_tiny.yaml +157 -0
  11. configs/pretrain/fiber_tiny_lr.yaml +157 -0
  12. configs/pretrain/fibert_flickr_only.yaml +157 -0
  13. configs/pretrain/glip_Swin_Flickr.yaml +116 -0
  14. configs/pretrain/glip_Swin_L.yaml +120 -0
  15. configs/pretrain/glip_Swin_T_O365.yaml +102 -0
  16. configs/pretrain/glip_Swin_T_O365_GoldG.yaml +132 -0
  17. configs/pretrain/glip_Swin_T_O365_GoldG_description.yaml +112 -0
  18. configs/pretrain/glip_Swin_T_cc.yaml +116 -0
  19. configs/pretrain/glip_Swin_T_cc_augv3.yaml +126 -0
  20. configs/pretrain/glip_Swin_T_coco.yaml +100 -0
  21. configs/pretrain/glip_Swing_T_flickr.yaml +116 -0
  22. configs/pretrain/glip_large.yaml +120 -0
  23. configs/pretrain/mixed_nococo_flickr_objects365.yaml +162 -0
  24. configs/pretrain/mixed_nococo_flickr_objects365_refexpclean.yaml +162 -0
  25. configs/pretrain_new/desco_fiber.yaml +168 -0
  26. configs/pretrain_new/desco_glip.yaml +134 -0
  27. configs/refcoco.yaml +116 -0
  28. configs/refcocog.yaml +116 -0
  29. configs/refcocoplus.yaml +116 -0
  30. configs/refexp/_refcoco+_testA.yaml +30 -0
  31. configs/refexp/_refcoco+_testB.yaml +30 -0
  32. configs/refexp/_refcoco_testA.yaml +30 -0
  33. configs/refexp/_refcoco_testB.yaml +30 -0
  34. configs/refexp/_refcocog_test.yaml +30 -0
  35. docs/intro.md +287 -0
  36. maskrcnn_benchmark/__init__.py +1 -0
  37. maskrcnn_benchmark/config/__init__.py +3 -0
  38. maskrcnn_benchmark/config/defaults.py +982 -0
  39. maskrcnn_benchmark/config/paths_catalog.py +779 -0
  40. maskrcnn_benchmark/csrc/ROIAlign.h +46 -0
  41. maskrcnn_benchmark/csrc/ROIPool.h +48 -0
  42. maskrcnn_benchmark/csrc/SigmoidFocalLoss.h +41 -0
  43. maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +257 -0
  44. maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +75 -0
  45. maskrcnn_benchmark/csrc/cpu/soft_nms.cpp +117 -0
  46. maskrcnn_benchmark/csrc/cpu/vision.h +22 -0
  47. maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +346 -0
  48. maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +202 -0
  49. maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +188 -0
  50. maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu +691 -0
MODEL/desco_glip_tiny.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:199479f67b5fbd4ab5e232c8fa8df3e9ab42a96966a023524c6cd95710ea5192
+size 3707483035
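The checkpoint itself lives in Git LFS, so the commit stores only this three-line pointer. As a sanity check, here is a minimal sketch (assuming the ~3.7 GB weight file has already been materialized locally at MODEL/desco_glip_tiny.pth) that verifies a downloaded file against the pointer's sha256 oid:

```python
import hashlib

# Assumed local path; adjust to wherever the LFS object was downloaded.
WEIGHT_PATH = "MODEL/desco_glip_tiny.pth"
EXPECTED_OID = "199479f67b5fbd4ab5e232c8fa8df3e9ab42a96966a023524c6cd95710ea5192"

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash the file in 1 MiB chunks so the 3.7 GB checkpoint never sits in RAM."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

assert sha256_of(WEIGHT_PATH) == EXPECTED_OID, "weights do not match the LFS pointer"
```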
app.py CHANGED
@@ -19,9 +19,12 @@ from maskrcnn_benchmark.config import cfg
 from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
 
 # Use this command for evaluate the GLIP-T model
-config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
+#config_file = "configs/pretrain/glip_Swin_T_O365_GoldG.yaml"
 #weight_file = "MODEL/glip_tiny_model_o365_goldg_cc_sbu.pth"
 
+config_file = "configs/pretrain_new/desco_glip.yaml"
+weight_file = "MODEL/desco_glip_tiny.pth"
+
 # Use this command if you want to try the GLIP-L model
 # ! wget https://penzhanwu2bbs.blob.core.windows.net/data/GLIPv1_Open/models/glip_large_model.pth -O MODEL/glip_large_model.pth
 # config_file = "configs/pretrain/glip_Swin_L.yaml"
@@ -61,12 +64,12 @@ gr.Interface(
         ),
     ],
     examples=[
-        ["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
-        ["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
+        #["./flickr_9472793441.jpg", "bobble heads on top of the shelf ."],
+        #["./flickr_9472793441.jpg", "sofa . remote . dog . person . car . sky . plane ."],
         ["./coco_000000281759.jpg", "A green umbrella. A pink striped umbrella. A plain white umbrella."],
         ["./coco_000000281759.jpg", "a flowery top. A blue dress. An orange shirt ."],
         ["./coco_000000281759.jpg", "a car . An electricity box ."],
-        ["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
+        #["./flickr_7520721.jpg", "A woman figure skater in a blue costume holds her leg by the blade of her skate ."]
     ],
     article=Path("docs/intro.md").read_text()
 ).launch()
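The net effect of this change is that the demo now loads the DesCo-GLIP config and checkpoint added in this commit. For context, a hedged sketch of how these two variables are typically consumed earlier in a GLIP-style app.py (merge_from_file/merge_from_list are standard yacs API on the cfg object this file already imports; the GLIPDemo keyword arguments are assumptions, not shown in this diff):

```python
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

config_file = "configs/pretrain_new/desco_glip.yaml"
weight_file = "MODEL/desco_glip_tiny.pth"

# Merge the YAML config into the global cfg, then point it at the new weights.
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])

# GLIPDemo wraps model construction, checkpoint loading, and grounded inference.
glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7)
```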
coco_000000281759.jpg ADDED
configs/flickr/test.yaml ADDED
@@ -0,0 +1,22 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # Placeholder
+  FCOS:
+    NUM_CLASSES: 8 # Placeholder
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8 # Placeholder
+  DYHEAD:
+    NUM_CLASSES: 8 # Placeholder
+DATASETS:
+  TRAIN: ("flickr30k_test", )
+  TEST: ("flickr30k_test", )
+  FLICKR_GT_TYPE: "separate"
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
configs/flickr/val.yaml ADDED
@@ -0,0 +1,22 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # Placeholder
+  FCOS:
+    NUM_CLASSES: 8 # Placeholder
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8 # Placeholder
+  DYHEAD:
+    NUM_CLASSES: 8 # Placeholder
+DATASETS:
+  TRAIN: ("flickr30k_val", )
+  TEST: ("flickr30k_val", )
+  FLICKR_GT_TYPE: "separate"
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
configs/lvis/minival.yaml ADDED
@@ -0,0 +1,30 @@
+MODEL:
+  ATSS:
+    NUM_CLASSES: 8 # these fields are not used; just a placeholder
+  FCOS:
+    NUM_CLASSES: 8
+  ROI_BOX_HEAD:
+    NUM_CLASSES: 8
+  DYHEAD:
+    NUM_CLASSES: 8
+DATASETS:
+  REGISTER:
+    lvis_evaluation_mini_val:
+      img_dir: "coco"
+      ann_file: "coco/annotations/lvis_v1_minival_inserted_image_name.json"
+    lvis_evaluation_val:
+      img_dir: "coco"
+      ann_file: "coco/annotations/lvis_od_val.json"
+  TRAIN: ("lvis_evaluation_mini_val",)
+  TEST: ("lvis_evaluation_mini_val",)
+
+INPUT:
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  ASPECT_RATIO_GROUPING: False
+TEST:
+  IMS_PER_BATCH: 8
configs/omnilabel/omnilabel_val_eval.yaml ADDED
@@ -0,0 +1,18 @@
+DATASETS:
+  REGISTER:
+    omnilabel_val_lvis_minival:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.json"
+    omnilabel_val_lvis_selected:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.selected.json"
+    omnilabel_val_lvis_auto:
+      img_dir: "coco/"
+      ann_file: "coco/annotations/lvis_v1.description_omni.auto.json"
+    omnilabel_val_flickr:
+      img_dir: "flickr30k/flickr30k_images/val/"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_val.v1.25-0.omnilabel.json"
+  TEST: ("omnilabel_val",)
+  # TEST: ("omnilabel_val_coco",)
+DATALOADER:
+  ASPECT_RATIO_GROUPING: False
configs/pretrain/_coco.yaml ADDED
@@ -0,0 +1,3 @@
+DATASETS:
+  TRAIN: ("coco_2017_train",)
+  TEST: ("coco_2017_val", )
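These tiny fragments are dataset overrides meant to be layered on top of a full pretraining config rather than used alone. A minimal sketch of that layering with the yacs-based cfg object the repo already imports (the choice of base config here is just an example):

```python
from maskrcnn_benchmark.config import cfg

# Later merges win key-by-key, so the small fragment only overrides DATASETS.
cfg.merge_from_file("configs/pretrain_new/desco_glip.yaml")  # full base config
cfg.merge_from_file("configs/pretrain/_coco.yaml")           # dataset override

print(cfg.DATASETS.TRAIN)  # -> ('coco_2017_train',)
```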
configs/pretrain/fiber_cc.yaml ADDED
@@ -0,0 +1,144 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+    EMBED_DIM: 128
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (4, 8, 16, 32)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (128, 256, 512, 1024)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-v2"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny.noun.harsh"
+      yaml_name_no_coco: "tiny.noun.harsh"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  CAPTION_CONF: 0.4
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 2
+  MIXED_COPY: 2
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 235026
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: False
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
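One detail worth flagging in the SOLVER block: STEPS is given as fractions rather than iteration counts. Assuming, as GLIP-style training loops typically do, that fractional milestones are resolved against MAX_ITER, the implied LR-decay points for this config are:

```python
# Sketch under the stated assumption that fractional STEPS scale with MAX_ITER.
max_iter = 235026          # SOLVER.MAX_ITER from fiber_cc.yaml
steps = (0.67, 0.89)       # SOLVER.STEPS

milestones = [int(frac * max_iter) for frac in steps]
print(milestones)  # [157467, 209173] -- LR drops at ~67% and ~89% of training
```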
configs/pretrain/fiber_tiny.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/fiber_tiny_lr.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/fibert_flickr_only.yaml ADDED
@@ -0,0 +1,157 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    FUSION_VERSION: "v2"
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    VERSION: "fusion"
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "roberta-fused-tiny"
+    MASK_SPECIAL: False
+    TOKENIZER_TYPE: "roberta-base"
+    USE_CHECKPOINT: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: False
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: False
+      EARLY_FUSE_ON: False
+      TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+  TRAIN: ("flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  ADD_DET_PROMPT: False
+  ADD_DET_PROMPT_ADVANCED: False
+  ALTERNATIVE_TRAINING: False
+  BOX_THRESHOLD: 0.1
+  CAPTION_CONF: 0.9
+  CAPTION_FORMAT_VERSION: "v2"
+  CAPTION_MIN_BOX: 1
+  CAPTION_NMS: 0.9
+  CLASS_AGNOSTIC: False
+  CLASS_CONCAT: False
+  COCO_COPY: 1
+  #CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  DISABLE_CLIP_TO_IMAGE: False
+  DISABLE_SHUFFLE: False
+  FEW_SHOT: 0
+  FLICKR_COPY: 1
+  FLICKR_GT_TYPE: "separate"
+  FULL_QUESTION_PROB: 0.5
+  FURTHER_SCREEN: False
+  GENERAL_COPY: -1
+  GENERAL_COPY_TEST: -1
+  INFERENCE_CAPTION: False
+  IN_COPY: 1
+  LOCAL_DEBUG: False
+  LVIS_COPY: 1
+  LVIS_USE_NORMAL_AP: False
+  MAX_BOX: -1
+  MIXED_COPY: 1
+  MULTISTAGE_TRAINING: False
+  NEG_QUESTION_PROB: 0.8
+  NO_MINUS_ONE_FOR_ONE_HOT: False
+  OBJECT365_COPY: 1
+  OI_COPY: 1
+  ONE_HOT: False
+  PACK_RANDOM_CAPTION_NUMBER: 0
+  POS_QUESTION_PROB: 0.6
+  PREDOWNLOAD_BING: False
+  PREDOWNLOAD_WITH_AZCOPY: False
+  PROMPT_LIMIT_NEG: -1
+  RANDOM_SAMPLE_NEG: 85
+
+  REPLACE_CLEAN_LABEL: False
+  SAFEGUARD_POSITIVE_CAPTION: True
+  SEPARATION_TOKENS: ". "
+  SHUFFLE_SEED: 0
+  TEST_DATASETNAME_SUFFIX: ""
+  TRAIN_DATASETNAME_SUFFIX: ""
+  USE_CAPTION_PROMPT: False
+  USE_COCO_FORMAT: False
+  USE_CROWD: False
+  USE_OD_AUG: False
+  USE_OVERRIDE_CATEGORY: False
+  USE_SUPRESS_QUERY: False
+  VG_COPY: 1
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 800000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  TEST_WITH_INFERENCE: True
+  FIND_UNUSED_PARAMETERS: True
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  CHECKPOINT_PERIOD: 2500
+
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
configs/pretrain/glip_Swin_Flickr.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("flickr30k_train", )
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  # BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 1
+  MIXED_COPY: 1
+  OBJECT365_COPY: 1
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_L.yaml ADDED
@@ -0,0 +1,120 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_large_patch4_window12_384_22k.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  SWINT:
+    EMBED_DIM: 192
+    DEPTHS: (2, 2, 18, 2)
+    NUM_HEADS: (6, 12, 24, 48)
+    WINDOW_SIZE: 12
+    OUT_CHANNELS: (192, 384, 768, 1536)
+    DROP_PATH_RATE: 0.4
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 8
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    USE_CHECKPOINT: True
+    FUSE_CONFIG:
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+DATASETS:
+
+  TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
+  TEST: ("coco_2017_val", )
+
+  ONE_HOT: False
+  FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
+  MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
+  OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
+  VG_COPY: 3 # 0.4 * 3 = ~1.2M
+  IN_COPY: 2 # 0.67 * 2 = ~1.33M
+  OI_COPY: 1 # 2M * 1 = 2M
+
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.01
+  WEIGHT_DECAY_SCHEDULE: True
+  STEPS: (0.67, 0.89)
+  MAX_ITER: 1000000
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
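The *_COPY factors oversample each grounding source so the training mixture lands near the totals quoted in the inline comments (IN and OI presumably stand for ImageNet boxes and OpenImages). A quick check that reproduces that arithmetic:

```python
# Approximate source sizes (millions of images) are taken from the config
# comments above; the multipliers are the *_COPY values.
sources = {
    "flickr":    (0.15, 8),   # FLICKR_COPY    -> ~1.2M
    "mixed":     (0.60, 4),   # MIXED_COPY     -> ~2.4M
    "object365": (1.40, 2),   # OBJECT365_COPY -> ~2.8M
    "vg":        (0.40, 3),   # VG_COPY        -> ~1.2M
    "in":        (0.67, 2),   # IN_COPY        -> ~1.33M
    "oi":        (2.00, 1),   # OI_COPY        -> 2M
}
for name, (size_m, copies) in sources.items():
    print(f"{name}: {size_m * copies:.2f}M effective samples per epoch")
```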
configs/pretrain/glip_Swin_T_O365.yaml ADDED
@@ -0,0 +1,102 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  TRAIN: ("object365_dt_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  DESCRIPTION_FILE: "DATASET/Objects365/descriptions/o365.description.v1.json"
+
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_O365_GoldG.yaml ADDED
@@ -0,0 +1,132 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    mixed_train_no_coco_noun:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
+    mixed_train_no_coco_gpt:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
+    flickr30k_train_gpt:
+      img_folder: "flickr30k/flickr30k_images/train"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
+      is_train: True
+    mixed_train_no_coco_noun_gpt:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
+    mixed_train_no_coco_noun_gpt_0422:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0422.json"
+    mixed_train_no_coco_noun_gpt_0425:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
+    flickr30k_train_gpt_0425:
+      img_folder: "flickr30k/flickr30k_images/train"
+      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
+      is_train: True
+
+  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json"
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_O365_GoldG_description.yaml ADDED
@@ -0,0 +1,112 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    mixed_train_no_coco_noun:
+      coco_img_dir: "coco/train2014"
+      vg_img_dir: "gqa/images"
+      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
+
+  TRAIN: ("object365_dt_train", "mixed_train_no_coco", "flickr30k_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  OD_TO_GROUNDING_VERSION: "description.gpt.v2.allow_zero"
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  SEPARATION_TOKENS: ". "
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+  MAX_NEG_PER_BATCH: 1.0
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_cc.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("bing_caption_train_no_coco",)
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 4
+  MIXED_COPY: 4
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_cc_augv3.yaml ADDED
@@ -0,0 +1,126 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny.noun.harsh"
+      yaml_name_no_coco: "tiny.noun.harsh"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  CAPTION_CONF: 0.4
+  CAPTION_AUGMENTATION_VERSION: "v3.v1"
+  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
+  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
+
+  TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 2
+  MIXED_COPY: 2
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  #MAX_EPOCH: 12
+  MAX_ITER: 235026
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swin_T_coco.yaml ADDED
@@ -0,0 +1,100 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+    FREEZE_CONV_BODY_AT: -1
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_FUSED_FEATURES_DOT_PRODUCT: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+    USE_CHECKPOINT: True
+
+TEST:
+  DURING_TRAINING: False
+  IMS_PER_BATCH: 64
+
+# use for grounding model
+DATASETS:
+  TRAIN: ("coco_2017_train", )
+  TEST: ("coco_2017_val", )
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
+
+  SEPARATION_TOKENS: ". "
+  DESCRIPTION_FILE: "DATASET/coco/annotations/coco.description.v1.json"
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 30
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+  USE_AMP: True
+  MODEL_EMA: 0.999
+  FIND_UNUSED_PARAMETERS: False
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_Swing_T_flickr.yaml ADDED
@@ -0,0 +1,116 @@
+MODEL:
+  META_ARCHITECTURE: "GeneralizedVLRCNN"
+  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
+  RPN_ONLY: True
+  RPN_ARCHITECTURE: "VLDYHEAD"
+
+  BACKBONE:
+    CONV_BODY: "SWINT-FPN-RETINANET"
+    OUT_CHANNELS: 256
+
+  LANGUAGE_BACKBONE:
+    FREEZE: False
+    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
+    MASK_SPECIAL: False
+
+  RPN:
+    USE_FPN: True
+    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
+    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
+    ASPECT_RATIOS: (1.0,)
+    SCALES_PER_OCTAVE: 1
+
+  DYHEAD:
+    CHANNELS: 256
+    NUM_CONVS: 6
+    USE_GN: True
+    USE_DYRELU: True
+    USE_DFCONV: True
+    USE_DYFUSE: True
+    TOPK: 9 # topk for selecting candidate positive samples from each level
+    SCORE_AGG: "MEAN"
+    LOG_SCALE: 0.0
+
+    FUSE_CONFIG:
+      EARLY_FUSE_ON: True
+      TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
+      USE_CLASSIFICATION_LOSS: False
+      USE_TOKEN_LOSS: False
+      USE_CONTRASTIVE_ALIGN_LOSS: False
+      CONTRASTIVE_HIDDEN_DIM: 64
+      USE_DOT_PRODUCT_TOKEN_LOSS: True
+      USE_LAYER_SCALE: True
+      CLAMP_MIN_FOR_UNDERFLOW: True
+      CLAMP_MAX_FOR_OVERFLOW: True
+      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
+      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
+      CLAMP_DOT_PRODUCT: True
+
+# use for grounding model
+DATASETS:
+  REGISTER:
+    bing_caption_train:
+      yaml_path: "GCC/CC3M/yamls"
+      yaml_name: "tiny"
+      yaml_name_no_coco: "tiny"
+
+  # PREDOWNLOAD_BING : True
+  # PREDOWNLOAD_WITH_AZCOPY : True
+
+  TRAIN: ("mixed_train_no_coco", ) #"bing_caption_train_no_coco")
+  # TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
+  TEST: ("coco_2017_val", )
+  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
+  # BING_INDEX_LIST: [ 0, 1, ]
+  ONE_HOT: False
+  FLICKR_COPY: 4
+  MIXED_COPY: 4
+  OBJECT365_COPY: 2
+  DISABLE_SHUFFLE: False
+  ADD_DET_PROMPT: False
+  RANDOM_SAMPLE_NEG: 85
+  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
+  FURTHER_SCREEN: True
+  CAPTION_CONF: 0.5
+  CAPTION_NMS: -1.0
+  CAPTION_MIN_BOX: 1
+
+  SEPARATION_TOKENS: ". "
+
+  PACK_RANDOM_CAPTION_NUMBER: 20
+  NO_RANDOM_PACK_PROBABILITY: 0.4
+  RANDOM_PACK_PROB: 0.5
+  CAPTION_FORMAT_VERSION: "v2"
+
+
+INPUT:
+  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
+  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
+  MIN_SIZE_TRAIN: 800
+  MAX_SIZE_TRAIN: 1333
+  MIN_SIZE_TEST: 800
+  MAX_SIZE_TEST: 1333
+
+AUGMENT:
+  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
+
+DATALOADER:
+  SIZE_DIVISIBILITY: 32
+  DISTRIBUTE_CHUNK_AMONG_NODE: False
+
+SOLVER:
+  OPTIMIZER: ADAMW
+  BASE_LR: 0.0001
+  LANG_LR: 0.00001
+  WEIGHT_DECAY: 0.0001
+  STEPS: (0.67, 0.89)
+  MAX_EPOCH: 12
+  IMS_PER_BATCH: 64
+  WARMUP_ITERS: 2000
+  WARMUP_FACTOR: 0.001
+
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 1.0
+    NORM_TYPE: 2.0
configs/pretrain/glip_large.yaml ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_large_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ CONV_BODY: "SWINT-FPN-RETINANET"
9
+ OUT_CHANNELS: 256
10
+
11
+ SWINT:
12
+ EMBED_DIM: 192
13
+ DEPTHS: (2, 2, 18, 2)
14
+ NUM_HEADS: (6, 12, 24, 48)
15
+ WINDOW_SIZE: 12
16
+ OUT_CHANNELS: (192, 384, 768, 1536)
17
+ DROP_PATH_RATE: 0.4
18
+
19
+ LANGUAGE_BACKBONE:
20
+ FREEZE: False
21
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
22
+ MASK_SPECIAL: False
23
+
24
+ RPN:
25
+ USE_FPN: True
26
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
27
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
28
+ ASPECT_RATIOS: (1.0,)
29
+ SCALES_PER_OCTAVE: 1
30
+
31
+ DYHEAD:
32
+ CHANNELS: 256
33
+ NUM_CONVS: 8
34
+ USE_GN: True
35
+ USE_DYRELU: True
36
+ USE_DFCONV: True
37
+ USE_DYFUSE: True
38
+ TOPK: 9 # topk for selecting candidate positive samples from each level
39
+ SCORE_AGG: "MEAN"
40
+ LOG_SCALE: 0.0
41
+
42
+ USE_CHECKPOINT: True
43
+ FUSE_CONFIG:
44
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
45
+ EARLY_FUSE_ON: True
46
+ TYPE: "MHA-B"
47
+ USE_CLASSIFICATION_LOSS: False
48
+ USE_TOKEN_LOSS: False
49
+ USE_CONTRASTIVE_ALIGN_LOSS: False
50
+ CONTRASTIVE_HIDDEN_DIM: 64
51
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
52
+ USE_LAYER_SCALE: True
53
+ CLAMP_MIN_FOR_UNDERFLOW: True
54
+ CLAMP_MAX_FOR_OVERFLOW: True
55
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
56
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
57
+ CLAMP_DOT_PRODUCT: True
58
+
59
+ DATASETS:
60
+
61
+ TRAIN: ("mixed_train_no_coco",) # Place holder dataset for now. To be updated in the next version
62
+ TEST: ("coco_2017_val", )
63
+
64
+ ONE_HOT: False
65
+ FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
66
+ MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
67
+ OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
68
+ VG_COPY: 3 # 0.4 * 3 = ~1.2M
69
+ IN_COPY: 2 # 0.67 * 2 = ~1.33M
70
+ OI_COPY: 1 # 2M * 1 = 2M
71
+
72
+ DISABLE_SHUFFLE: False
73
+ ADD_DET_PROMPT: False
74
+ RANDOM_SAMPLE_NEG: 85
75
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
76
+ FURTHER_SCREEN: True
77
+ CAPTION_CONF: 0.5
78
+ CAPTION_NMS: -1.0
79
+ CAPTION_MIN_BOX: 1
80
+
81
+ SEPARATION_TOKENS: ". "
82
+
83
+ PACK_RANDOM_CAPTION_NUMBER: 20
84
+ NO_RANDOM_PACK_PROBABILITY: 0.4
85
+ RANDOM_PACK_PROB: 0.5
86
+ CAPTION_FORMAT_VERSION: "v2"
87
+
88
+ INPUT:
89
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
90
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
91
+ MIN_SIZE_TRAIN: 800
92
+ MAX_SIZE_TRAIN: 1333
93
+ MIN_SIZE_TEST: 800
94
+ MAX_SIZE_TEST: 1333
95
+
96
+ AUGMENT:
97
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
98
+
99
+ DATALOADER:
100
+ SIZE_DIVISIBILITY: 32
101
+
102
+ SOLVER:
103
+ OPTIMIZER: ADAMW
104
+ BASE_LR: 0.0001
105
+ LANG_LR: 0.00001
106
+ WEIGHT_DECAY: 0.01
107
+ WEIGHT_DECAY_SCHEDULE: True
108
+ STEPS: (0.67, 0.89)
109
+ MAX_ITER: 1000000
110
+ IMS_PER_BATCH: 64
111
+ WARMUP_ITERS: 2000
112
+ WARMUP_FACTOR: 0.001
113
+
114
+ FIND_UNUSED_PARAMETERS: False
115
+
116
+ CLIP_GRADIENTS:
117
+ ENABLED: True
118
+ CLIP_TYPE: "full_model"
119
+ CLIP_VALUE: 1.0
120
+ NORM_TYPE: 2.0
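
The *_COPY factors above oversample the smaller grounding sources so that each contributes a comparable share per effective epoch; the inline comments give approximate dataset sizes. Reproducing that arithmetic (sizes in millions of images, taken from the comments above, so treat them as rough):

    # (approx. images in millions, copy factor) per source, from the comments above
    mix = {"flickr": (0.15, 8), "mixed": (0.6, 4), "object365": (1.4, 2),
           "vg": (0.4, 3), "imagenet": (0.67, 2), "openimages": (2.0, 1)}
    effective = {name: size * copies for name, (size, copies) in mix.items()}
    print(effective)                  # every source lands in the ~1.2M-2.8M band
    print(sum(effective.values()))    # ~10.9M images per effective epoch
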
configs/pretrain/mixed_nococo_flickr_objects365.yaml ADDED
@@ -0,0 +1,162 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v2"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
65
+ TEST: ("coco_2017_val", )
66
+ ADD_DET_PROMPT: False
67
+ ADD_DET_PROMPT_ADVANCED: False
68
+ ALTERNATIVE_TRAINING: False
69
+ BOX_THRESHOLD: 0.1
70
+ CAPTION_CONF: 0.9
71
+ CAPTION_FORMAT_VERSION: "v2"
72
+ CAPTION_MIN_BOX: 1
73
+ CAPTION_NMS: 0.9
74
+ CLASS_AGNOSTIC: False
75
+ CLASS_CONCAT: False
76
+ COCO_COPY: 1
77
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
78
+ DISABLE_CLIP_TO_IMAGE: False
79
+ DISABLE_SHUFFLE: False
80
+ FEW_SHOT: 0
81
+ FLICKR_COPY: 1
82
+ FLICKR_GT_TYPE: "separate"
83
+ FULL_QUESTION_PROB: 0.5
84
+ FURTHER_SCREEN: False
85
+ GENERAL_COPY: -1
86
+ GENERAL_COPY_TEST: -1
87
+ INFERENCE_CAPTION: False
88
+ IN_COPY: 1
89
+ LOCAL_DEBUG: False
90
+ LVIS_COPY: 1
91
+ LVIS_USE_NORMAL_AP: False
92
+ MAX_BOX: -1
93
+ MIXED_COPY: 1
94
+ MULTISTAGE_TRAINING: False
95
+ NEG_QUESTION_PROB: 0.8
96
+ NO_MINUS_ONE_FOR_ONE_HOT: False
97
+ OBJECT365_COPY: 1
98
+ OI_COPY: 1
99
+ ONE_HOT: False
100
+ PACK_RANDOM_CAPTION_NUMBER: 0
101
+ POS_QUESTION_PROB: 0.6
102
+ PREDOWNLOAD_BING: False
103
+ PREDOWNLOAD_WITH_AZCOPY: False
104
+ PROMPT_LIMIT_NEG: -1
105
+ RANDOM_SAMPLE_NEG: 85
106
+
107
+ REPLACE_CLEAN_LABEL: False
108
+ SAFEGUARD_POSITIVE_CAPTION: True
109
+ SEPARATION_TOKENS: ". "
110
+ SHUFFLE_SEED: 0
111
+ TEST_DATASETNAME_SUFFIX: ""
112
+ TRAIN_DATASETNAME_SUFFIX: ""
113
+ USE_CAPTION_PROMPT: False
114
+ USE_COCO_FORMAT: False
115
+ USE_CROWD: False
116
+ USE_OD_AUG: False
117
+ USE_OVERRIDE_CATEGORY: False
118
+ USE_SUPRESS_QUERY: False
119
+ VG_COPY: 1
120
+
121
+ INPUT:
122
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
123
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
124
+ MIN_SIZE_TRAIN: 800
125
+ MAX_SIZE_TRAIN: 1333
126
+ MIN_SIZE_TEST: 800
127
+ MAX_SIZE_TEST: 1333
128
+
129
+ AUGMENT:
130
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
131
+
132
+ DATALOADER:
133
+ SIZE_DIVISIBILITY: 32
134
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
135
+
136
+ SOLVER:
137
+ OPTIMIZER: ADAMW
138
+ BASE_LR: 0.0001
139
+ LANG_LR: 0.00001
140
+ WEIGHT_DECAY: 0.01
141
+ WEIGHT_DECAY_SCHEDULE: True
142
+ STEPS: (0.67, 0.89)
143
+ MAX_ITER: 800000
144
+ IMS_PER_BATCH: 64
145
+ WARMUP_ITERS: 2000
146
+ WARMUP_FACTOR: 0.001
147
+ TEST_WITH_INFERENCE: True
148
+ FIND_UNUSED_PARAMETERS: False
149
+ USE_AMP: True
150
+ MODEL_EMA: 0.999
151
+ CHECKPOINT_PERIOD: 2500
152
+
153
+
154
+ CLIP_GRADIENTS:
155
+ ENABLED: True
156
+ CLIP_TYPE: "full_model"
157
+ CLIP_VALUE: 1.0
158
+ NORM_TYPE: 2.0
159
+
160
+ TEST:
161
+ DURING_TRAINING: False
162
+ IMS_PER_BATCH: 64
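
SOLVER.STEPS here is fractional rather than absolute. Assuming the trainer multiplies fractional milestones by MAX_ITER (the values in (0, 1) strongly suggest this convention), the learning-rate decay points work out to:

    max_iter = 800_000                    # SOLVER.MAX_ITER
    steps = (0.67, 0.89)                  # SOLVER.STEPS, fractions of MAX_ITER
    milestones = [int(s * max_iter) for s in steps]
    print(milestones)                     # [536000, 712000]: iterations where the LR drops
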
configs/pretrain/mixed_nococo_flickr_objects365_refexpclean.yaml ADDED
@@ -0,0 +1,162 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_large_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v3"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ TRAIN: ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train" )
65
+ TEST: ("coco_2017_val", )
66
+ ADD_DET_PROMPT: False
67
+ ADD_DET_PROMPT_ADVANCED: False
68
+ ALTERNATIVE_TRAINING: False
69
+ BOX_THRESHOLD: 0.1
70
+ CAPTION_CONF: 0.9
71
+ CAPTION_FORMAT_VERSION: "v2"
72
+ CAPTION_MIN_BOX: 1
73
+ CAPTION_NMS: 0.9
74
+ CLASS_AGNOSTIC: False
75
+ CLASS_CONCAT: False
76
+ COCO_COPY: 1
77
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
78
+ DISABLE_CLIP_TO_IMAGE: False
79
+ DISABLE_SHUFFLE: False
80
+ FEW_SHOT: 0
81
+ FLICKR_COPY: 1
82
+ FLICKR_GT_TYPE: "separate"
83
+ FULL_QUESTION_PROB: 0.5
84
+ FURTHER_SCREEN: False
85
+ GENERAL_COPY: -1
86
+ GENERAL_COPY_TEST: -1
87
+ INFERENCE_CAPTION: False
88
+ IN_COPY: 1
89
+ LOCAL_DEBUG: False
90
+ LVIS_COPY: 1
91
+ LVIS_USE_NORMAL_AP: False
92
+ MAX_BOX: -1
93
+ MIXED_COPY: 1
94
+ MULTISTAGE_TRAINING: False
95
+ NEG_QUESTION_PROB: 0.8
96
+ NO_MINUS_ONE_FOR_ONE_HOT: False
97
+ OBJECT365_COPY: 1
98
+ OI_COPY: 1
99
+ ONE_HOT: False
100
+ PACK_RANDOM_CAPTION_NUMBER: 0
101
+ POS_QUESTION_PROB: 0.6
102
+ PREDOWNLOAD_BING: False
103
+ PREDOWNLOAD_WITH_AZCOPY: False
104
+ PROMPT_LIMIT_NEG: -1
105
+ RANDOM_SAMPLE_NEG: 85
106
+
107
+ REPLACE_CLEAN_LABEL: False
108
+ SAFEGUARD_POSITIVE_CAPTION: True
109
+ SEPARATION_TOKENS: ". "
110
+ SHUFFLE_SEED: 0
111
+ TEST_DATASETNAME_SUFFIX: ""
112
+ TRAIN_DATASETNAME_SUFFIX: ""
113
+ USE_CAPTION_PROMPT: False
114
+ USE_COCO_FORMAT: False
115
+ USE_CROWD: False
116
+ USE_OD_AUG: False
117
+ USE_OVERRIDE_CATEGORY: False
118
+ USE_SUPRESS_QUERY: False
119
+ VG_COPY: 1
120
+
121
+ INPUT:
122
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
123
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
124
+ MIN_SIZE_TRAIN: 800
125
+ MAX_SIZE_TRAIN: 1333
126
+ MIN_SIZE_TEST: 800
127
+ MAX_SIZE_TEST: 1333
128
+
129
+ AUGMENT:
130
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
131
+
132
+ DATALOADER:
133
+ SIZE_DIVISIBILITY: 32
134
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
135
+
136
+ SOLVER:
137
+ OPTIMIZER: ADAMW
138
+ BASE_LR: 0.0001
139
+ LANG_LR: 0.00001
140
+ WEIGHT_DECAY: 0.01
141
+ WEIGHT_DECAY_SCHEDULE: True
142
+ STEPS: (0.67, 0.89)
143
+ MAX_ITER: 800000
144
+ IMS_PER_BATCH: 64
145
+ WARMUP_ITERS: 5000
146
+ WARMUP_FACTOR: 0.001
147
+ TEST_WITH_INFERENCE: True
148
+ FIND_UNUSED_PARAMETERS: False
149
+ USE_AMP: True
150
+ MODEL_EMA: 0.999
151
+ CHECKPOINT_PERIOD: 2500
152
+
153
+
154
+ CLIP_GRADIENTS:
155
+ ENABLED: True
156
+ CLIP_TYPE: "full_model"
157
+ CLIP_VALUE: 1.0
158
+ NORM_TYPE: 2.0
159
+
160
+ TEST:
161
+ DURING_TRAINING: False
162
+ IMS_PER_BATCH: 64
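
MODEL_EMA: 0.999 maintains an exponential moving average of the weights, typically used for evaluation checkpoints. A minimal sketch of the usual update rule; the real hook lives in the trainer, and the names here are illustrative:

    import torch

    @torch.no_grad()
    def ema_update(ema_model, model, decay=0.999):   # decay = SOLVER.MODEL_EMA
        # After each optimizer step, blend the EMA copy toward the live weights.
        for ema_p, p in zip(ema_model.parameters(), model.parameters()):
            ema_p.mul_(decay).add_(p, alpha=1.0 - decay)
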
configs/pretrain_new/desco_fiber.yaml ADDED
@@ -0,0 +1,168 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+
7
+ BACKBONE:
8
+ FUSION_VERSION: "v2"
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ SWINT:
13
+ VERSION: "fusion"
14
+ EMBED_DIM: 128
15
+ DEPTHS: (2, 2, 18, 2)
16
+ NUM_HEADS: (4, 8, 16, 32)
17
+ WINDOW_SIZE: 12
18
+ OUT_CHANNELS: (128, 256, 512, 1024)
19
+ DROP_PATH_RATE: 0.4
20
+
21
+ LANGUAGE_BACKBONE:
22
+ FREEZE: False
23
+ MODEL_TYPE: "roberta-fused-v2"
24
+ MASK_SPECIAL: False
25
+ TOKENIZER_TYPE: "roberta-base"
26
+ USE_CHECKPOINT: False
27
+
28
+ RPN:
29
+ USE_FPN: True
30
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
31
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
32
+ ASPECT_RATIOS: (1.0,)
33
+ SCALES_PER_OCTAVE: 1
34
+
35
+ DYHEAD:
36
+ CHANNELS: 256
37
+ NUM_CONVS: 6
38
+ USE_GN: True
39
+ USE_DYRELU: True
40
+ USE_DFCONV: True
41
+ USE_DYFUSE: True
42
+ TOPK: 9 # topk for selecting candidate positive samples from each level
43
+ SCORE_AGG: "MEAN"
44
+ LOG_SCALE: 0.0
45
+
46
+ USE_CHECKPOINT: True
47
+ FUSE_CONFIG:
48
+ USE_FUSED_FEATURES_DOT_PRODUCT: False
49
+ EARLY_FUSE_ON: False
50
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
51
+ USE_CLASSIFICATION_LOSS: False
52
+ USE_TOKEN_LOSS: False
53
+ USE_CONTRASTIVE_ALIGN_LOSS: False
54
+ CONTRASTIVE_HIDDEN_DIM: 64
55
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
56
+ USE_LAYER_SCALE: True
57
+ CLAMP_MIN_FOR_UNDERFLOW: True
58
+ CLAMP_MAX_FOR_OVERFLOW: True
59
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
60
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
61
+ CLAMP_DOT_PRODUCT: True
62
+
63
+ DATASETS:
64
+ REGISTER:
65
+ bing_caption_train:
66
+ yaml_path: "GCC/CC3M/yamls"
67
+ yaml_name: "tiny.noun.harsh"
68
+ yaml_name_no_coco: "tiny.noun.harsh"
69
+ mixed_train_no_coco_noun:
70
+ coco_img_dir: "coco/train2014"
71
+ vg_img_dir: "gqa/images"
72
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
73
+ mixed_train_no_coco_gpt:
74
+ coco_img_dir: "coco/train2014"
75
+ vg_img_dir: "gqa/images"
76
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
77
+ flickr30k_train_gpt:
78
+ img_folder: "flickr30k/flickr30k_images/train"
79
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
80
+ is_train: True
81
+ mixed_train_no_coco_noun_gpt:
82
+ coco_img_dir: "coco/train2014"
83
+ vg_img_dir: "gqa/images"
84
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
85
+ mixed_train_no_coco_noun_gpt_0425:
86
+ coco_img_dir: "coco/train2014"
87
+ vg_img_dir: "gqa/images"
88
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
89
+ flickr30k_train_gpt_0425:
90
+ img_folder: "flickr30k/flickr30k_images/train"
91
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
92
+ is_train: True
93
+
94
+ CAPTION_CONF: 0.4
95
+ OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1"
96
+ CAPTION_AUGMENTATION_VERSION: "mixed.v4.8-2.drop_positive.control_pos.grouping.v1"
97
+ CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3-v4.9-1.drop_positive.control_pos.grouping.v1"
98
+ CAPTION_VOCAB_FILE: "tools/files/joint_vocab.merged.v1.tmp0.davincci.json"
99
+ DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
100
+
101
+ TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
102
+ TEST: ("coco_2017_val", )
103
+ ADD_DET_PROMPT: False
104
+ ADD_DET_PROMPT_ADVANCED: False
105
+ ALTERNATIVE_TRAINING: False
106
+ BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
107
+ ONE_HOT: False
108
+ FLICKR_COPY: 2
109
+ MIXED_COPY: 2
110
+ OBJECT365_COPY: 2
111
+ DISABLE_SHUFFLE: False
112
+ ADD_DET_PROMPT: False
113
+ RANDOM_SAMPLE_NEG: 85
114
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
115
+ FURTHER_SCREEN: True
116
+
117
+ CAPTION_NMS: -1.0
118
+ CAPTION_MIN_BOX: 1
119
+
120
+ SEPARATION_TOKENS: ". "
121
+
122
+ PACK_RANDOM_CAPTION_NUMBER: 20
123
+ NO_RANDOM_PACK_PROBABILITY: 0.4
124
+ RANDOM_PACK_PROB: 0.5
125
+ CAPTION_FORMAT_VERSION: "v2"
126
+
127
+ INPUT:
128
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
129
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
130
+ MIN_SIZE_TRAIN: 800
131
+ MAX_SIZE_TRAIN: 1333
132
+ MIN_SIZE_TEST: 800
133
+ MAX_SIZE_TEST: 1333
134
+
135
+ AUGMENT:
136
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
137
+
138
+ DATALOADER:
139
+ SIZE_DIVISIBILITY: 32
140
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
141
+
142
+ SOLVER:
143
+ OPTIMIZER: ADAMW
144
+ BASE_LR: 0.0001
145
+ LANG_LR: 0.00001
146
+ WEIGHT_DECAY: 0.01
147
+ WEIGHT_DECAY_SCHEDULE: True
148
+ STEPS: (0.67, 0.89)
149
+ MAX_ITER: 800000
150
+ IMS_PER_BATCH: 64
151
+ WARMUP_ITERS: 2000
152
+ WARMUP_FACTOR: 0.001
153
+ TEST_WITH_INFERENCE: True
154
+ FIND_UNUSED_PARAMETERS: False
155
+ USE_AMP: True
156
+ MODEL_EMA: 0.999
157
+ CHECKPOINT_PERIOD: 2500
158
+
159
+
160
+ CLIP_GRADIENTS:
161
+ ENABLED: True
162
+ CLIP_TYPE: "full_model"
163
+ CLIP_VALUE: 1.0
164
+ NORM_TYPE: 2.0
165
+
166
+ TEST:
167
+ DURING_TRAINING: False
168
+ IMS_PER_BATCH: 64
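
WARMUP_ITERS: 2000 with WARMUP_FACTOR: 0.001 ramps the learning rate from 0.1% of BASE_LR up to the full value over the first 2k iterations. A sketch of the standard linear-warmup rule these two fields imply (the exact schedule is defined by the trainer, so take this as the conventional reading):

    def lr_at(step, base_lr=1e-4, warmup_iters=2000, warmup_factor=1e-3):
        # Linear warmup: interpolate from warmup_factor * base_lr up to base_lr.
        if step >= warmup_iters:
            return base_lr
        alpha = step / warmup_iters
        return base_lr * (warmup_factor * (1 - alpha) + alpha)

    print(lr_at(0), lr_at(1000), lr_at(2000))   # ~1e-07, ~5.005e-05, 0.0001
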
configs/pretrain_new/desco_glip.yaml ADDED
@@ -0,0 +1,134 @@
 
1
+ # for final GLIP tiny, pre-trained from scratch
2
+ MODEL:
3
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
4
+ WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
5
+ RPN_ONLY: True
6
+ RPN_ARCHITECTURE: "VLDYHEAD"
7
+
8
+ BACKBONE:
9
+ CONV_BODY: "SWINT-FPN-RETINANET"
10
+ OUT_CHANNELS: 256
11
+
12
+ LANGUAGE_BACKBONE:
13
+ FREEZE: False
14
+ MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
15
+ MASK_SPECIAL: False
16
+
17
+ RPN:
18
+ USE_FPN: True
19
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
20
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
21
+ ASPECT_RATIOS: (1.0,)
22
+ SCALES_PER_OCTAVE: 1
23
+
24
+ DYHEAD:
25
+ CHANNELS: 256
26
+ NUM_CONVS: 6
27
+ USE_GN: True
28
+ USE_DYRELU: True
29
+ USE_DFCONV: True
30
+ USE_DYFUSE: True
31
+ TOPK: 9 # topk for selecting candidate positive samples from each level
32
+ SCORE_AGG: "MEAN"
33
+ LOG_SCALE: 0.0
34
+ USE_CHECKPOINT: True
35
+ FUSE_CONFIG:
36
+ EARLY_FUSE_ON: True
37
+ TYPE: "MHA-B" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
38
+ USE_CLASSIFICATION_LOSS: False
39
+ USE_TOKEN_LOSS: False
40
+ USE_CONTRASTIVE_ALIGN_LOSS: False
41
+ CONTRASTIVE_HIDDEN_DIM: 64
42
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
43
+ USE_FUSED_FEATURES_DOT_PRODUCT: True
44
+ USE_LAYER_SCALE: True
45
+ CLAMP_MIN_FOR_UNDERFLOW: True
46
+ CLAMP_MAX_FOR_OVERFLOW: True
47
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
48
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
49
+ CLAMP_DOT_PRODUCT: True
50
+
51
+ # used for the grounding model
52
+ DATASETS:
53
+ REGISTER:
54
+ bing_caption_train:
55
+ yaml_path: "GCC/CC3M/yamls"
56
+ yaml_name: "tiny.noun.harsh"
57
+ yaml_name_no_coco: "tiny.noun.harsh"
58
+ mixed_train_no_coco_noun_gpt_0425:
59
+ coco_img_dir: "coco/train2014"
60
+ vg_img_dir: "gqa/images"
61
+ ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
62
+ flickr30k_train_gpt_0425:
63
+ img_folder: "flickr30k/flickr30k_images/train"
64
+ ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
65
+ is_train: True
66
+
67
+ CAPTION_CONF: 0.4
68
+
69
+ CAPTION_AUGMENTATION_VERSION: "mixed.v4-v3.5-4-1.drop_positive.control_pos.grouping.v1" # for GoldG data; used by CaptionAugmentation to determine how to perform the augmentation
70
+ OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1" # for detection (OD) data; determines how detection labels are converted into grounding prompts
71
+ CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3.8-2.drop_positive.control_pos.grouping.v1" # for CC data; used by CaptionAugmentation to determine how to perform the augmentation
72
+ CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
73
+ DESCRIPTION_FILE: "tools/files/o365.description.v1.json"
74
+
75
+ TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
76
+ TEST: ("coco_2017_val", )
77
+ BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
78
+ # BING_INDEX_LIST: [ 0, 1, ]
79
+ ONE_HOT: False
80
+ FLICKR_COPY: 2
81
+ MIXED_COPY: 2
82
+ OBJECT365_COPY: 1
83
+ DISABLE_SHUFFLE: False
84
+ ADD_DET_PROMPT: False
85
+ RANDOM_SAMPLE_NEG: 85
86
+ CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
87
+ FURTHER_SCREEN: True
88
+
89
+ CAPTION_NMS: -1.0
90
+ CAPTION_MIN_BOX: 1
91
+
92
+ SEPARATION_TOKENS: ". "
93
+
94
+ PACK_RANDOM_CAPTION_NUMBER: 20
95
+ NO_RANDOM_PACK_PROBABILITY: 0.4
96
+ RANDOM_PACK_PROB: 0.5
97
+ CAPTION_FORMAT_VERSION: "v2"
98
+
99
+
100
+ INPUT:
101
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
102
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
103
+ MIN_SIZE_TRAIN: 800
104
+ MAX_SIZE_TRAIN: 1333
105
+ MIN_SIZE_TEST: 800
106
+ MAX_SIZE_TEST: 1333
107
+
108
+ AUGMENT:
109
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
110
+
111
+ DATALOADER:
112
+ SIZE_DIVISIBILITY: 32
113
+ DISTRIBUTE_CHUNK_AMONG_NODE: False
114
+
115
+ SOLVER:
116
+ OPTIMIZER: ADAMW
117
+ BASE_LR: 0.0001
118
+ LANG_LR: 0.00001
119
+ WEIGHT_DECAY: 0.0001
120
+ STEPS: (0.67, 0.89)
121
+ #MAX_EPOCH: 12
122
+ MAX_ITER: 300000
123
+ IMS_PER_BATCH: 64
124
+ WARMUP_ITERS: 2000
125
+ WARMUP_FACTOR: 0.001
126
+ USE_AMP: True
127
+ MODEL_EMA: 0.999
128
+ FIND_UNUSED_PARAMETERS: False
129
+
130
+ CLIP_GRADIENTS:
131
+ ENABLED: True
132
+ CLIP_TYPE: "full_model"
133
+ CLIP_VALUE: 1.0
134
+ NORM_TYPE: 2.0
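
CLIP_GRADIENTS with CLIP_TYPE "full_model", NORM_TYPE 2.0 and CLIP_VALUE 1.0 reads as clipping the global L2 gradient norm over all parameters to 1.0. In plain PyTorch that operation is the following (an assumption about how "full_model" is interpreted, but the field names map onto clip_grad_norm_ directly):

    import torch

    model = torch.nn.Linear(4, 2)                    # stand-in for the detector
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    # Clip the global (full-model) L2 gradient norm to CLIP_VALUE = 1.0.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2.0)
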
configs/refcoco.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcoco_train", )
72
+ TEST: ("refcoco_val", )
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
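
INPUT.PIXEL_MEAN / PIXEL_STD in all of these configs are per-channel statistics in BGR channel order on 0-255 images (the Caffe/Detectron convention, matching OpenCV image loading). The equivalent normalization, spelled out:

    import numpy as np

    mean = np.array([103.530, 116.280, 123.675])   # INPUT.PIXEL_MEAN, BGR order
    std = np.array([57.375, 57.120, 58.395])       # INPUT.PIXEL_STD
    img_bgr = np.zeros((800, 1333, 3), dtype=np.float32)  # stand-in image in 0-255 range
    img_norm = (img_bgr - mean) / std              # per-channel standardization
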
configs/refcocog.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcocog_train", )
72
+ TEST: ("refcocog_val",)
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
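
TEST.EVAL_TASK "grounding" together with ATSS.INFERENCE_TH 0.0 means no score threshold is applied at inference, so every candidate box survives and the evaluator can score just the top-ranked box per referring expression. A sketch of that top-1 protocol (iou() is a hypothetical helper; the actual evaluator is part of the benchmark code):

    def top1_grounding_accuracy(predictions, gt_boxes, iou, thresh=0.5):
        # predictions: per-expression lists of {"box": ..., "score": ...} dicts.
        hits = 0
        for dets, gt in zip(predictions, gt_boxes):
            best = max(dets, key=lambda d: d["score"])  # INFERENCE_TH 0.0 keeps all boxes
            hits += iou(best["box"], gt) >= thresh
        return hits / len(gt_boxes)
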
configs/refcocoplus.yaml ADDED
@@ -0,0 +1,116 @@
 
1
+ MODEL:
2
+ META_ARCHITECTURE: "GeneralizedVLRCNN"
3
+ WEIGHT: "swin_base_patch4_window12_384_22k.pth"
4
+ RPN_ONLY: True
5
+ RPN_ARCHITECTURE: "VLDYHEAD"
6
+ ATSS:
7
+ PRE_NMS_TOP_N: 3000
8
+ DETECTIONS_PER_IMG: 100
9
+ INFERENCE_TH: 0.0
10
+
11
+ SWINT:
12
+ VERSION: "fusion"
13
+ EMBED_DIM: 128
14
+ DEPTHS: (2, 2, 18, 2)
15
+ NUM_HEADS: (4, 8, 16, 32)
16
+ WINDOW_SIZE: 12
17
+ OUT_CHANNELS: (128, 256, 512, 1024)
18
+ DROP_PATH_RATE: 0.4
19
+
20
+ BACKBONE:
21
+ FUSION_VERSION: "v3"
22
+ CONV_BODY: "SWINT-FPN-RETINANET"
23
+ OUT_CHANNELS: 256
24
+ USE_CHECKPOINT: True
25
+ FREEZE_CONV_BODY_AT: -1
26
+
27
+ LANGUAGE_BACKBONE:
28
+ FREEZE: False
29
+ MODEL_TYPE: "roberta-fused-v2"
30
+ TOKENIZER_TYPE: "roberta-base"
31
+ LANG_DIM: 768
32
+ MASK_SPECIAL: False
33
+ USE_CHECKPOINT: False
34
+
35
+ RPN:
36
+ USE_FPN: True
37
+ ANCHOR_SIZES: (64, 128, 256, 512, 1024)
38
+ ANCHOR_STRIDE: (8, 16, 32, 64, 128)
39
+ ASPECT_RATIOS: (1.0,)
40
+ SCALES_PER_OCTAVE: 1
41
+
42
+ DYHEAD:
43
+ CHANNELS: 256
44
+ NUM_CONVS: 6
45
+ USE_GN: True
46
+ USE_DYRELU: True
47
+ USE_DFCONV: True
48
+ USE_DYFUSE: True
49
+ TOPK: 9
50
+ SCORE_AGG: "MEAN"
51
+ LOG_SCALE: 0.0
52
+ USE_CHECKPOINT: True
53
+
54
+ FUSE_CONFIG:
55
+ EARLY_FUSE_ON: False
56
+ TYPE: "NONE" # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
57
+ USE_CLASSIFICATION_LOSS: False
58
+ USE_TOKEN_LOSS: False
59
+ USE_CONTRASTIVE_ALIGN_LOSS: False
60
+ CONTRASTIVE_HIDDEN_DIM: 64
61
+ USE_DOT_PRODUCT_TOKEN_LOSS: True
62
+ USE_LAYER_SCALE: True
63
+ CLAMP_MIN_FOR_UNDERFLOW: True
64
+ CLAMP_MAX_FOR_OVERFLOW: True
65
+ CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
66
+ CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
67
+ CLAMP_DOT_PRODUCT: True
68
+
69
+ # used for the grounding model
70
+ DATASETS:
71
+ TRAIN: ("refcoco+_train", )
72
+ TEST: ("refcoco+_val",)
73
+ DISABLE_SHUFFLE: True
74
+
75
+ INPUT:
76
+ PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
77
+ PIXEL_STD: [ 57.375, 57.120, 58.395 ]
78
+ MIN_SIZE_TRAIN: 800
79
+ MAX_SIZE_TRAIN: 1333
80
+ MIN_SIZE_TEST: 800
81
+ MAX_SIZE_TEST: 1333
82
+
83
+ AUGMENT:
84
+ MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
85
+ FLIP_PROB_TRAIN: 0.0 # Important especially for RefCOCO: horizontal flips would invert "left"/"right" in referring expressions
86
+
87
+ DATALOADER:
88
+ SIZE_DIVISIBILITY: 32
89
+
90
+ SOLVER:
91
+ OPTIMIZER: ADAMW
92
+ BASE_LR: 0.00001
93
+ LANG_LR: 0.00001
94
+ WEIGHT_DECAY: 0.0001
95
+ STEPS: (0.67, 0.89)
96
+ MAX_EPOCH: 20
97
+ IMS_PER_BATCH: 16
98
+ WARMUP_ITERS: 2000
99
+ WARMUP_FACTOR: 0.001
100
+ TEST_WITH_INFERENCE: True
101
+ FIND_UNUSED_PARAMETERS: False
102
+ USE_AMP: True
103
+ MODEL_EMA: 0.999
104
+
105
+ CLIP_GRADIENTS:
106
+ ENABLED: False
107
+ CLIP_TYPE: "full_model"
108
+ CLIP_VALUE: 1.0
109
+ NORM_TYPE: 2.0
110
+
111
+ TEST:
112
+ DURING_TRAINING: True
113
+ EVAL_TASK: "grounding"
114
+ IMS_PER_BATCH: 16
115
+
116
+
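
SOLVER.USE_AMP: True enables mixed-precision training, which is also why FUSE_CONFIG carries all the CLAMP_* flags: they keep the fusion logits inside fp16 range. The standard PyTorch AMP loop such a config relies on looks roughly like this (a sketch, not the repository's trainer):

    import torch

    model = torch.nn.Linear(16, 4).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # SOLVER.OPTIMIZER / BASE_LR
    scaler = torch.cuda.amp.GradScaler()

    for _ in range(2):                           # stand-in training steps
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():          # forward pass in mixed precision
            loss = model(torch.randn(8, 16, device="cuda")).sum()
        scaler.scale(loss).backward()            # scaled backward avoids fp16 underflow
        scaler.step(optimizer)
        scaler.update()
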
configs/refexp/_refcoco+_testA.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco+_testA", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco+_testB.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco+_testB", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco_testA.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco_testA", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcoco_testB.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcoco_testB", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
configs/refexp/_refcocog_test.yaml ADDED
@@ -0,0 +1,30 @@
 
1
+ MODEL:
2
+ ATSS:
3
+ NUM_CLASSES: 8
4
+ FCOS:
5
+ NUM_CLASSES: 8
6
+ ROI_BOX_HEAD:
7
+ NUM_CLASSES: 8
8
+ DYHEAD:
9
+ NUM_CLASSES: 8
10
+ DATASETS:
11
+ TEST: ("refcocog_test", )
12
+ FLICKR_GT_TYPE: "separate"
13
+
14
+ INPUT:
15
+ MIN_SIZE_TRAIN: 800
16
+ MAX_SIZE_TRAIN: 1333
17
+ MIN_SIZE_TEST: 800
18
+ MAX_SIZE_TEST: 1333
19
+ DATALOADER:
20
+ SIZE_DIVISIBILITY: 32
21
+ ASPECT_RATIO_GROUPING: False
22
+ SOLVER:
23
+ WARMUP_ITERS: 0
24
+ MAX_EPOCH: 12
25
+ CHECKPOINT_PERIOD: 100
26
+ TEST:
27
+ IMS_PER_BATCH: 8
28
+
29
+
30
+ # local debug command: CUDA_VISIBLE_DEVICES=0 python tools/finetune.py --config-file configs/harold/dyhead_grounding.yaml --ft-tasks configs/odinw/_flickr.yaml --skip-train SOLVER.IMS_PER_BATCH 1 MODEL.WEIGHT OUTPUT/model_0345000.pth OUTPUT_DIR tmp TEST.IMS_PER_BATCH 1 TEST.EVAL_TASK grounding TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM 100
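
The five configs/refexp/_*.yaml files above are not standalone: they carry only the test split plus a handful of overrides, and the debug command embedded in each passes such task files via --ft-tasks on top of a full config. Assuming the standard yacs merge order (base file first, task file second, later values win), the composition looks like:

    from maskrcnn_benchmark.config import cfg

    cfg.merge_from_file("configs/refcocoplus.yaml")             # full model + solver settings
    cfg.merge_from_file("configs/refexp/_refcoco+_testA.yaml")  # split-specific overrides
    cfg.freeze()
    print(cfg.DATASETS.TEST)                                    # ('refcoco+_testA',)
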
docs/intro.md ADDED
@@ -0,0 +1,287 @@
 
144
+
145
+ <div class="flex flex-col-reverse gap-x-2 sm:flex-row sm:items-center sm:justify-between xl:ml-auto"><div class="-mb-px flex h-12 items-center overflow-x-auto overflow-y-hidden sm:h-[3.25rem]"><a class="tab-alternate " href="/spaces/haotiz/glip-zeroshot-demo"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg>
146
+ App
147
+
148
+
149
+ </a><a class="tab-alternate active" href="/spaces/haotiz/glip-zeroshot-demo/tree/main"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-tertiary" d="M21 19h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0-4h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0-8h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2zm0 4h-8a1 1 0 0 1 0-2h8a1 1 0 0 1 0 2z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M9 19a1 1 0 0 1-1-1V6a1 1 0 0 1 2 0v12a1 1 0 0 1-1 1zm-6-4.333a1 1 0 0 1-.64-1.769L3.438 12l-1.078-.898a1 1 0 0 1 1.28-1.538l2 1.667a1 1 0 0 1 0 1.538l-2 1.667a.999.999 0 0 1-.64.231z" fill="currentColor"></path></svg>
150
+ <span class="xl:hidden">Files</span>
151
+ <span class="hidden xl:inline">Files</span>
152
+
153
+
154
+ </a><a class="tab-alternate " href="/spaces/haotiz/glip-zeroshot-demo/discussions"><svg class="mr-1.5 text-gray-400 flex-none" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M20.6081 3C21.7684 3 22.8053 3.49196 23.5284 4.38415C23.9756 4.93678 24.4428 5.82749 24.4808 7.16133C24.9674 7.01707 25.4353 6.93643 25.8725 6.93643C26.9833 6.93643 27.9865 7.37587 28.696 8.17411C29.6075 9.19872 30.0124 10.4579 29.8361 11.7177C29.7523 12.3177 29.5581 12.8555 29.2678 13.3534C29.8798 13.8646 30.3306 14.5763 30.5485 15.4322C30.719 16.1032 30.8939 17.5006 29.9808 18.9403C30.0389 19.0342 30.0934 19.1319 30.1442 19.2318C30.6932 20.3074 30.7283 21.5229 30.2439 22.6548C29.5093 24.3704 27.6841 25.7219 24.1397 27.1727C21.9347 28.0753 19.9174 28.6523 19.8994 28.6575C16.9842 29.4379 14.3477 29.8345 12.0653 29.8345C7.87017 29.8345 4.8668 28.508 3.13831 25.8921C0.356375 21.6797 0.754104 17.8269 4.35369 14.1131C6.34591 12.058 7.67023 9.02782 7.94613 8.36275C8.50224 6.39343 9.97271 4.20438 12.4172 4.20438H12.4179C12.6236 4.20438 12.8314 4.2214 13.0364 4.25468C14.107 4.42854 15.0428 5.06476 15.7115 6.02205C16.4331 5.09583 17.134 4.359 17.7682 3.94323C18.7242 3.31737 19.6794 3 20.6081 3ZM20.6081 5.95917C20.2427 5.95917 19.7963 6.1197 19.3039 6.44225C17.7754 7.44319 14.8258 12.6772 13.7458 14.7131C13.3839 15.3952 12.7655 15.6837 12.2086 15.6837C11.1036 15.6837 10.2408 14.5497 12.1076 13.1085C14.9146 10.9402 13.9299 7.39584 12.5898 7.1776C12.5311 7.16799 12.4731 7.16355 12.4172 7.16355C11.1989 7.16355 10.6615 9.33114 10.6615 9.33114C10.6615 9.33114 9.0863 13.4148 6.38031 16.206C3.67434 18.998 3.5346 21.2388 5.50675 24.2246C6.85185 26.2606 9.42666 26.8753 12.0653 26.8753C14.8021 26.8753 17.6077 26.2139 19.1799 25.793C19.2574 25.7723 28.8193 22.984 27.6081 20.6107C27.4046 20.212 27.0693 20.0522 26.6471 20.0522C24.9416 20.0522 21.8393 22.6726 20.5057 22.6726C20.2076 22.6726 19.9976 22.5416 19.9116 22.222C19.3433 20.1173 28.552 19.2325 27.7758 16.1839C27.639 15.6445 27.2677 15.4256 26.746 15.4263C24.4923 15.4263 19.4358 19.5181 18.3759 19.5181C18.2949 19.5181 18.2368 19.4937 18.2053 19.4419C17.6743 18.557 17.9653 17.9394 21.7082 15.6009C25.4511 13.2617 28.0783 11.8545 26.5841 10.1752C26.4121 9.98141 26.1684 9.8956 25.8725 9.8956C23.6001 9.89634 18.2311 14.9403 18.2311 14.9403C18.2311 14.9403 16.7821 16.496 15.9057 16.496C15.7043 16.496 15.533 16.4139 15.4169 16.2112C14.7956 15.1296 21.1879 10.1286 21.5484 8.06535C21.7928 6.66715 21.3771 5.95917 20.6081 5.95917Z" fill="#FF9D00"></path><path d="M5.50686 24.2246C3.53472 21.2387 3.67446 18.9979 6.38043 16.206C9.08641 13.4147 10.6615 9.33111 10.6615 9.33111C10.6615 9.33111 11.2499 6.95933 12.59 7.17757C13.93 7.39581 14.9139 10.9401 12.1069 13.1084C9.29997 15.276 12.6659 16.7489 13.7459 14.713C14.8258 12.6772 17.7747 7.44316 19.304 6.44221C20.8326 5.44128 21.9089 6.00204 21.5484 8.06532C21.188 10.1286 14.795 15.1295 15.4171 16.2118C16.0391 17.2934 18.2312 14.9402 18.2312 14.9402C18.2312 14.9402 25.0907 8.49588 26.5842 10.1752C28.0776 11.8545 25.4512 13.2616 21.7082 15.6008C17.9646 17.9393 17.6744 18.557 18.2054 19.4418C18.7372 20.3266 26.9998 13.1351 27.7759 16.1838C28.5513 19.2324 19.3434 20.1173 19.9117 22.2219C20.48 24.3274 26.3979 18.2382 27.6082 20.6107C28.8193 22.9839 19.2574 25.7722 19.18 25.7929C16.0914 26.62 8.24723 28.3726 5.50686 24.2246Z" fill="#FFD21E"></path></svg>
155
+ Community
156
+ <div class="ml-1.5 flex h-4 min-w-[1rem] items-center justify-center rounded px-1 text-xs leading-none shadow-sm bg-black text-white dark:bg-gray-800 dark:text-gray-200">2
157
+ </div>
158
+
159
+ </a>
160
+ </div>
161
+
162
+
163
+
164
+ <div class="hidden sm:block mt-2 lg:mt-0"><div class="relative ">
165
+ <button class="btn px-1 py-1 text-base translate-y-px " type="button">
166
+
167
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" class="p-0.5" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><circle cx="16" cy="7" r="3" fill="currentColor"></circle><circle cx="16" cy="16" r="3" fill="currentColor"></circle><circle cx="16" cy="25" r="3" fill="currentColor"></circle></svg>
168
+ <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" class="absolute right-[-0.18rem] bottom-[-0.18rem] rounded-sm bg-gray-50 p-px text-[0.85rem] text-gray-500 dark:bg-gray-925" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 12 12"><path fill="currentColor" d="M7.975 3.489a.438.438 0 0 1 0 .618L4.262 7.82a.416.416 0 0 1-.307.126.427.427 0 0 1-.311-.126.438.438 0 0 1 0-.618L7.357 3.49a.438.438 0 0 1 .618 0ZM6.427 8.132 4.88 9.675a2.17 2.17 0 0 1-3.09 0 2.188 2.188 0 0 1 0-3.09l1.542-1.548a.437.437 0 0 0-.618-.619L1.166 5.966a3.063 3.063 0 0 0 4.332 4.332L7.046 8.75a.438.438 0 0 0-.619-.618Zm4.026-7.121a3.063 3.063 0 0 0-4.332 0L4.573 2.559a.438.438 0 0 0 .618.618L6.74 1.635a2.171 2.171 0 0 1 3.09 0 2.188 2.188 0 0 1 0 3.09L8.287 6.273a.432.432 0 0 0 0 .618.421.421 0 0 0 .475.097.438.438 0 0 0 .143-.097l1.548-1.548a3.068 3.068 0 0 0 0-4.332Z"></path></svg>
169
+
170
+ </button>
171
+
172
+
173
+ </div></div>
174
+ </div></div></header>
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+ </div>
191
+
192
+ <div class="container relative flex flex-col md:grid md:space-y-0 w-full md:grid-cols-12 space-y-4 md:gap-6 mb-16"><section class="pt-8 border-gray-100 col-span-full"><header class="flex flex-wrap items-center justify-start pb-2 md:justify-end lg:flex-nowrap"><div class="mr-4 flex min-w-0 basis-auto flex-wrap items-center md:flex-grow md:basis-full lg:basis-auto lg:flex-nowrap"><div class="SVELTE_HYDRATER contents" data-props="{&quot;path&quot;:&quot;docs/intro.md&quot;,&quot;repoName&quot;:&quot;haotiz/glip-zeroshot-demo&quot;,&quot;repoType&quot;:&quot;space&quot;,&quot;rev&quot;:&quot;main&quot;,&quot;refs&quot;:{&quot;branches&quot;:[{&quot;name&quot;:&quot;main&quot;,&quot;ref&quot;:&quot;refs/heads/main&quot;,&quot;targetCommit&quot;:&quot;7f799e88e07dd635fe84c11e57e3f6a08b59b911&quot;}],&quot;tags&quot;:[],&quot;converts&quot;:[]},&quot;view&quot;:&quot;blob&quot;}" data-target="BranchSelector"><div class="relative mr-4 mb-2">
193
+ <button class="text-sm md:text-base btn w-full cursor-pointer text-sm" type="button">
194
+ <svg class="mr-1.5 text-gray-700 dark:text-gray-400" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24" style="transform: rotate(360deg);"><path d="M13 14c-3.36 0-4.46 1.35-4.82 2.24C9.25 16.7 10 17.76 10 19a3 3 0 0 1-3 3a3 3 0 0 1-3-3c0-1.31.83-2.42 2-2.83V7.83A2.99 2.99 0 0 1 4 5a3 3 0 0 1 3-3a3 3 0 0 1 3 3c0 1.31-.83 2.42-2 2.83v5.29c.88-.65 2.16-1.12 4-1.12c2.67 0 3.56-1.34 3.85-2.23A3.006 3.006 0 0 1 14 7a3 3 0 0 1 3-3a3 3 0 0 1 3 3c0 1.34-.88 2.5-2.09 2.86C17.65 11.29 16.68 14 13 14m-6 4a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1M7 4a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1m10 2a1 1 0 0 0-1 1a1 1 0 0 0 1 1a1 1 0 0 0 1-1a1 1 0 0 0-1-1z" fill="currentColor"></path></svg>
195
+ main
196
+ <svg class="-mr-1 text-gray-500" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path d="M16.293 9.293L12 13.586L7.707 9.293l-1.414 1.414L12 16.414l5.707-5.707z" fill="currentColor"></path></svg></button>
197
+
198
+
199
+ </div></div>
200
+ <div class="mb-2 flex items-center overflow-hidden"><a class="truncate text-gray-800 hover:underline" href="/spaces/haotiz/glip-zeroshot-demo/tree/main">glip-zeroshot-demo</a>
201
+ <span class="mx-1 text-gray-300">/</span>
202
+ <a class="truncate hover:underline dark:text-gray-300" href="/spaces/haotiz/glip-zeroshot-demo/tree/main/docs">docs
203
+ </a>
204
+ <span class="mx-1 text-gray-300">/</span><span class="dark:text-gray-300">intro.md</span></div></div>
205
+
206
+
207
+ </header>
208
+ <div class="SVELTE_HYDRATER contents" data-props="{&quot;commitLast&quot;:{&quot;date&quot;:&quot;2022-09-09T18:53:48.000Z&quot;,&quot;subject&quot;:&quot;initial commit&quot;,&quot;authors&quot;:[{&quot;_id&quot;:&quot;631516348d85ad332fa47b2c&quot;,&quot;avatar&quot;:&quot;/avatars/100f5ae3cf3c52faaecdaecd5d8f2881.svg&quot;,&quot;isHf&quot;:false,&quot;user&quot;:&quot;haotiz&quot;}],&quot;commit&quot;:{&quot;id&quot;:&quot;708dec4d8a2103c25db9eb7c24720af2dccfe72d&quot;,&quot;parentIds&quot;:[&quot;321aba1bb0f2180bfd7f7000fc7cc96699e74092&quot;]},&quot;title&quot;:&quot;initial commit&quot;},&quot;repo&quot;:{&quot;name&quot;:&quot;haotiz/glip-zeroshot-demo&quot;,&quot;type&quot;:&quot;space&quot;}}" data-target="LastCommit"><div class="from-gray-100-to-white flex items-baseline rounded-t-lg border border-b-0 bg-gradient-to-t px-3 py-2 dark:border-gray-800"><img class="mr-2.5 mt-0.5 h-4 w-4 self-center rounded-full" alt="haotiz's picture" src="/avatars/100f5ae3cf3c52faaecdaecd5d8f2881.svg">
209
+ <div class="mr-5 flex flex-none items-center truncate"><a class="hover:underline" href="/haotiz">haotiz
210
+ </a>
211
+
212
+ </div>
213
+ <div class="mr-4 truncate font-mono text-sm text-gray-500 hover:prose-a:underline"><!-- HTML_TAG_START -->initial commit<!-- HTML_TAG_END --></div>
214
+ <a class="rounded border bg-gray-50 px-1.5 text-sm hover:underline dark:border-gray-800 dark:bg-gray-900" href="/spaces/haotiz/glip-zeroshot-demo/commit/708dec4d8a2103c25db9eb7c24720af2dccfe72d">708dec4</a>
215
+
216
+ <time class="ml-auto hidden flex-none truncate pl-2 text-gray-500 dark:text-gray-400 lg:block" datetime="2022-09-09T18:53:48" title="Fri, 09 Sep 2022 18:53:48 GMT">about 1 year ago</time></div></div>
217
+ <div class="flex flex-wrap items-center border px-3 py-1.5 text-sm text-gray-800 dark:border-gray-800 dark:bg-gray-900"><div class="flex items-center gap-3 text-sm font-medium"><a class="rounded-md px-1.5 capitalize bg-gray-200 dark:bg-gray-800" href="/spaces/haotiz/glip-zeroshot-demo/blob/main/docs/intro.md">preview</a>
218
+ <a class="rounded-md px-1.5 capitalize " href="/spaces/haotiz/glip-zeroshot-demo/blob/main/docs/intro.md?code=true">code</a></div>
219
+ <div class="mx-4 text-gray-200">|</div>
220
+ <a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/raw/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M31 16l-7 7l-1.41-1.41L28.17 16l-5.58-5.59L24 9l7 7z" fill="currentColor"></path><path d="M1 16l7-7l1.41 1.41L3.83 16l5.58 5.59L8 23l-7-7z" fill="currentColor"></path><path d="M12.419 25.484L17.639 6l1.932.518L14.35 26z" fill="currentColor"></path></svg>
221
+ raw
222
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/commits/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M16 4C9.383 4 4 9.383 4 16s5.383 12 12 12s12-5.383 12-12S22.617 4 16 4zm0 2c5.535 0 10 4.465 10 10s-4.465 10-10 10S6 21.535 6 16S10.465 6 16 6zm-1 2v9h7v-2h-5V8z" fill="currentColor"></path></svg>
223
+ history
224
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/blame/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" style="transform: rotate(360deg);"><path d="M16 2a14 14 0 1 0 14 14A14 14 0 0 0 16 2zm0 26a12 12 0 1 1 12-12a12 12 0 0 1-12 12z" fill="currentColor"></path><path d="M11.5 11a2.5 2.5 0 1 0 2.5 2.5a2.48 2.48 0 0 0-2.5-2.5z" fill="currentColor"></path><path d="M20.5 11a2.5 2.5 0 1 0 2.5 2.5a2.48 2.48 0 0 0-2.5-2.5z" fill="currentColor"></path></svg>
225
+ blame
226
+ </a><a class="my-1 mr-4 flex items-center hover:underline text-green-600 dark:text-gray-300" href="/spaces/haotiz/glip-zeroshot-demo/edit/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M2 26h28v2H2z" fill="currentColor"></path><path d="M25.4 9c.8-.8.8-2 0-2.8l-3.6-3.6c-.8-.8-2-.8-2.8 0l-15 15V24h6.4l15-15zm-5-5L24 7.6l-3 3L17.4 7l3-3zM6 22v-3.6l10-10l3.6 3.6l-10 10H6z" fill="currentColor"></path></svg>
227
+ contribute
228
+ </a><a class="my-1 mr-4 flex items-center hover:underline " href="/spaces/haotiz/glip-zeroshot-demo/delete/main/docs/intro.md"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M12 12h2v12h-2z" fill="currentColor"></path><path d="M18 12h2v12h-2z" fill="currentColor"></path><path d="M4 6v2h2v20a2 2 0 0 0 2 2h16a2 2 0 0 0 2-2V8h2V6zm4 22V8h16v20z" fill="currentColor"></path><path d="M12 2h8v2h-8z" fill="currentColor"></path></svg>
229
+ delete
230
+ </a>
231
+ <div class="mr-4 flex items-center text-gray-400"><svg class="text-gray-300 text-sm mr-1.5 -translate-y-px" width="1em" height="1em" viewBox="0 0 22 28" fill="none" xmlns="http://www.w3.org/2000/svg"><path fill-rule="evenodd" clip-rule="evenodd" d="M15.3634 10.3639C15.8486 10.8491 15.8486 11.6357 15.3634 12.1209L10.9292 16.5551C10.6058 16.8785 10.0814 16.8785 9.7579 16.5551L7.03051 13.8277C6.54532 13.3425 6.54532 12.5558 7.03051 12.0707C7.51569 11.5855 8.30234 11.5855 8.78752 12.0707L9.7579 13.041C10.0814 13.3645 10.6058 13.3645 10.9292 13.041L13.6064 10.3639C14.0916 9.8787 14.8782 9.8787 15.3634 10.3639Z" fill="currentColor"></path><path fill-rule="evenodd" clip-rule="evenodd" d="M10.6666 27.12C4.93329 25.28 0 19.2267 0 12.7867V6.52001C0 5.40001 0.693334 4.41334 1.73333 4.01334L9.73333 1.01334C10.3333 0.786673 11 0.786673 11.6 1.02667L19.6 4.02667C20.1083 4.21658 20.5465 4.55701 20.8562 5.00252C21.1659 5.44803 21.3324 5.97742 21.3333 6.52001V12.7867C21.3333 19.24 16.4 25.28 10.6666 27.12Z" fill="currentColor" fill-opacity="0.22"></path><path d="M10.0845 1.94967L10.0867 1.94881C10.4587 1.8083 10.8666 1.81036 11.2286 1.95515L11.2387 1.95919L11.2489 1.963L19.2489 4.963L19.25 4.96342C19.5677 5.08211 19.8416 5.29488 20.0351 5.57333C20.2285 5.85151 20.3326 6.18203 20.3333 6.52082C20.3333 6.52113 20.3333 6.52144 20.3333 6.52176L20.3333 12.7867C20.3333 18.6535 15.8922 24.2319 10.6666 26.0652C5.44153 24.2316 1 18.6409 1 12.7867V6.52001C1 5.82357 1.42893 5.20343 2.08883 4.94803L10.0845 1.94967Z" stroke="currentColor" stroke-opacity="0.30" stroke-width="2"></path></svg>
232
+
233
+ No virus
234
+ </div>
235
+
236
+ <div class="dark:text-gray-300 sm:ml-auto">1.19 kB</div></div>
237
+
238
+ <div class="relative min-h-[100px] rounded-b-lg border border-t-0 leading-tight dark:border-gray-800 dark:bg-gray-925">
239
+ <div class="py-4 px-4 sm:px-6 prose hf-sanitized hf-sanitized-aKOIK5UWHeZSqPRRfdqN9">
240
+ ["**GLIP: Grounded Language-Image Pre-training. CVPR 2022, Best Paper Finalist**"](https://arxiv.org/abs/2112.03857)
+
+ This is the HuggingFace Gradio demo for GLIP. The model takes two inputs: an image and a text. The text can be either a natural sentence description (grounding) or a simple concatenation of category names (object detection).
+
+ The paper presents a grounded language-image pre-training (GLIP) model for learning object-level, language-aware, and semantic-rich visual representations. GLIP unifies object detection and phrase grounding for pre-training. The unification brings two benefits: 1) it allows GLIP to learn from both detection and grounding data, improving both tasks and bootstrapping a good grounding model; 2) GLIP can leverage massive image-text pairs by generating grounding boxes in a self-training fashion, making the learned representation semantic-rich.
+
+ Code: https://github.com/microsoft/GLIP
+
+ **News**: We are also holding an ODinW challenge at [the CV in the Wild Workshop @ ECCV 2022](https://computer-vision-in-the-wild.github.io/eccv-2022/). We hope our open-source code encourages the community to participate in this challenge!
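As a rough illustration of the two text-input modes, here is a minimal sketch built on the `GLIPDemo` wrapper this Space uses; the weight path, image file, caption strings, and thresholds below are illustrative assumptions rather than the Space's exact settings:

```python
# Minimal sketch of the two input modes (grounding vs. detection).
# Assumptions: paths, captions, and thresholds are illustrative.
import numpy as np
from PIL import Image

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file("configs/pretrain/glip_Swin_T_O365_GoldG.yaml")
cfg.merge_from_list(["MODEL.WEIGHT", "MODEL/glip_tiny_model_o365_goldg_cc_sbu.pth"])
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])

glip_demo = GLIPDemo(cfg, min_image_size=800, confidence_threshold=0.7)

# GLIPDemo expects a BGR uint8 array (cf. INPUT.TO_BGR255 in the config).
image = np.array(Image.open("coco_000000281759.jpg").convert("RGB"))[:, :, ::-1]

# 1) Grounding: a natural sentence description.
result, _ = glip_demo.run_on_web_image(image, "a person holding an umbrella .", 0.5)

# 2) Detection: a simple concatenation of category names.
result, _ = glip_demo.run_on_web_image(image, "person . umbrella . car .", 0.5)
```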
maskrcnn_benchmark/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
maskrcnn_benchmark/config/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ from .defaults import _C as cfg
+ from .paths_catalog import try_to_find
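For orientation, the re-exported `cfg` is a standard yacs `CfgNode`, so the usual load-then-override flow applies. A minimal sketch, assuming an illustrative YAML path:

```python
from maskrcnn_benchmark.config import cfg

# Start from the defaults declared in defaults.py, layer a YAML config on
# top, then override individual keys from a flat key/value list.
cfg.merge_from_file("configs/pretrain/glip_Swin_T_O365_GoldG.yaml")  # illustrative path
cfg.merge_from_list(["MODEL.DEVICE", "cpu", "INPUT.MIN_SIZE_TEST", 600])
cfg.freeze()  # make the tree immutable before model construction

print(cfg.MODEL.META_ARCHITECTURE)  # "GeneralizedRCNN" unless overridden
```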
maskrcnn_benchmark/config/defaults.py ADDED
@@ -0,0 +1,982 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ import os
3
+
4
+ from yacs.config import CfgNode as CN
5
+
6
+ # -----------------------------------------------------------------------------
7
+ # Convention about Training / Test specific parameters
8
+ # -----------------------------------------------------------------------------
9
+ # Whenever an argument can be either used for training or for testing, the
10
+ # corresponding name will be post-fixed by a _TRAIN for a training parameter,
11
+ # or _TEST for a test-specific parameter.
12
+ # For example, the number of images during training will be
13
+ # IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be
14
+ # IMAGES_PER_BATCH_TEST
15
+
16
+ # -----------------------------------------------------------------------------
17
+ # Config definition
18
+ # -----------------------------------------------------------------------------
19
+
20
+ _C = CN()
21
+
22
+ _C.MODEL = CN()
23
+ _C.MODEL.RPN_ONLY = False
24
+ _C.MODEL.BOX_ON = True
25
+ _C.MODEL.MASK_ON = False
26
+ _C.MODEL.KEYPOINT_ON = False
27
+ _C.MODEL.DEVICE = "cuda"
28
+
29
+ _C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN"
30
+
31
+ _C.MODEL.RPN_ARCHITECTURE = "RPN"
32
+ _C.MODEL.DEBUG = False # add debug flag
33
+ _C.MODEL.ONNX = False # add onnx flag
34
+
35
+ # If the WEIGHT starts with a catalog://, like :R-50, the code will look for
36
+ # the path in paths_catalog. Else, it will use it as the specified absolute
37
+ # path
38
+ _C.MODEL.WEIGHT = ""
39
+ _C.MODEL.PRETRAIN_NAME = ""
40
+
41
+ # If LINEAR_PROB = True, only the last linear layers in rpn and roi_head are trainable
42
+ _C.MODEL.LINEAR_PROB = False
43
+
44
+ # -----------------------------------------------------------------------------
45
+ # Multitask Training / Test specific parameters
46
+ # -----------------------------------------------------------------------------
47
+ _C.MODEL.MULTITASK = CN(new_allowed=True)
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # INPUT
51
+ # -----------------------------------------------------------------------------
52
+ _C.INPUT = CN()
53
+ # Size of the smallest side of the image during training
54
+ _C.INPUT.MIN_SIZE_TRAIN = 800 # (800,)
55
+ # Maximum size of the side of the image during training
56
+ _C.INPUT.MAX_SIZE_TRAIN = 1333
57
+ # Size of the smallest side of the image during testing
58
+ _C.INPUT.MIN_SIZE_TEST = 800
59
+ # Maximum size of the side of the image during testing
60
+ _C.INPUT.MAX_SIZE_TEST = 1333
61
+ # Values to be used for image normalization
62
+ _C.INPUT.PIXEL_MEAN = [102.9801, 115.9465, 122.7717]
63
+ # Values to be used for image normalization
64
+ _C.INPUT.PIXEL_STD = [1.0, 1.0, 1.0]
65
+ # Convert image to BGR format (for Caffe2 models), in range 0-255
66
+ _C.INPUT.TO_BGR255 = True
67
+ _C.INPUT.FORMAT = ""
68
+ _C.INPUT.FIX_RES = False
69
+
70
+ # -----------------------------------------------------------------------------
71
+ # Augmentation
72
+ # -----------------------------------------------------------------------------
73
+ _C.AUGMENT = CN()
74
+ _C.AUGMENT.USE_RA = 0
75
+ _C.AUGMENT.FLIP_PROB_TRAIN = 0.5
76
+ _C.AUGMENT.VERTICAL_FLIP_PROB_TRAIN = 0.0
77
+ _C.AUGMENT.MULT_MIN_SIZE_TRAIN = ()
78
+
79
+ _C.AUGMENT.BRIGHTNESS = 0.0
80
+ _C.AUGMENT.CONTRAST = 0.0
81
+ _C.AUGMENT.SATURATION = 0.0
82
+ _C.AUGMENT.HUE = 0.0
83
+
84
+ _C.AUGMENT.CROP_PROB = 0.5
85
+ _C.AUGMENT.CROP_MIN_IOUS = (0.1, 0.3, 0.5, 0.7, 0.9)
86
+ _C.AUGMENT.CROP_MIN_SIZE = 0.3
87
+
88
+ _C.AUGMENT.AFFINE_PROB = 0.5
89
+ _C.AUGMENT.AFFINE_R = (-10, 10)
90
+ _C.AUGMENT.AFFINE_T = (0.1, 0.1)
91
+ _C.AUGMENT.AFFINE_S = (0.9, 1.1)
92
+ _C.AUGMENT.AFFINE_SHEAR = (-2, 2)
93
+ _C.AUGMENT.AFFINE_FILL = (127.5, 127.5, 127.5)
94
+
95
+ _C.AUGMENT.ERASE_PROB = 0.0
96
+ _C.AUGMENT.ERASE_L = 0.02
97
+ _C.AUGMENT.ERASE_H = 1 / 3
98
+ _C.AUGMENT.ERASE_MIN_ASPECT = 0.3
99
+ _C.AUGMENT.ERASE_MODE = "const"
100
+ _C.AUGMENT.ERASE_MAX_COUNT = 1
101
+ _C.AUGMENT.ERASE_MAX_OVERLAP = 0.6
102
+ _C.AUGMENT.ERASE_MAX_VALUE = 255
103
+
104
+ _C.AUGMENT.MOSAIC_PROB = 0.0
105
+ _C.AUGMENT.MOSAIC_SHIFT = 0.5
106
+ _C.AUGMENT.MOSAIC_SIZE = -1
107
+
108
+ _C.AUGMENT.PASTE_PROB = 0.0
109
+ _C.AUGMENT.PASTE_CAT = ()
110
+ _C.AUGMENT.PASTE_NUM = 2
111
+ # -----------------------------------------------------------------------------
112
+ # Dataset
113
+ # -----------------------------------------------------------------------------
114
+ _C.DATASETS = CN()
115
+ # List of the dataset names for training, as present in paths_catalog.py
116
+ _C.DATASETS.TRAIN = ()
117
+ # List of the dataset names for testing, as present in paths_catalog.py
118
+ _C.DATASETS.TEST = ()
119
+ # Use is_crowd label
120
+ _C.DATASETS.USE_CROWD = False
121
+ _C.DATASETS.CLASS_AGNOSTIC = False
122
+ _C.DATASETS.CLASS_CONCAT = False
123
+ _C.DATASETS.MAX_BOX = -1
124
+ _C.DATASETS.SAMPLE_RATIO = 0.0
125
+ _C.DATASETS.FEW_SHOT = 0
126
+ # SHUFFLE_SEED != 0 means shuffle the dataset in the few shot setting
127
+ _C.DATASETS.SHUFFLE_SEED = 0
128
+ _C.DATASETS.PREDEFINED_TEXT = ""
129
+ _C.DATASETS.ALTERNATIVE_TRAINING = False
130
+ _C.DATASETS.MULTISTAGE_TRAINING = False
131
+ _C.DATASETS.REGISTER = CN(new_allowed=True)
132
+ _C.DATASETS.BOX_THRESHOLD = 0.1
133
+ # Duplicate Dataset
134
+ _C.DATASETS.COCO_COPY = 1
135
+ _C.DATASETS.LVIS_COPY = 1
136
+ _C.DATASETS.FLICKR_COPY = 1
137
+ _C.DATASETS.MIXED_COPY = 1
138
+ _C.DATASETS.OBJECT365_COPY = 1
139
+ _C.DATASETS.VG_COPY = 1
140
+ _C.DATASETS.OI_COPY = 1
141
+ _C.DATASETS.IN_COPY = 1
142
+ _C.DATASETS.MIXED_GPT_COPY = 1
143
+
144
+ # Duplicate Dataset
145
+ _C.DATASETS.COCO_COPY = 1
146
+ _C.DATASETS.FLICKR_COPY = 1
147
+ _C.DATASETS.MIXED_COPY = 1
148
+ _C.DATASETS.OBJECT365_COPY = 1
149
+ _C.DATASETS.VG_COPY = 1
150
+ _C.DATASETS.OI_COPY = 1
151
+ _C.DATASETS.IN_COPY = 1
152
+ _C.DATASETS.REFCOCO_COPY = 1
153
+ _C.DATASETS.GENERAL_COPY = -1
154
+ _C.DATASETS.GENERAL_COPY_TEST = -1
155
+
156
+ # OD to Grounding
157
+ _C.DATASETS.RANDOM_SAMPLE_NEG = -1
158
+ _C.DATASETS.ADD_DET_PROMPT = False
159
+ _C.DATASETS.ADD_DET_PROMPT_ADVANCED = False
160
+ _C.DATASETS.USE_OD_AUG = False
161
+ _C.DATASETS.USE_COCO_FORMAT = False
162
+ _C.DATASETS.CONTROL_PROB = ()
163
+ _C.DATASETS.DISABLE_SHUFFLE = False
164
+ _C.DATASETS.PROMPT_VERSION = ""
165
+ _C.DATASETS.PROMPT_LIMIT_NEG = -1
166
+ _C.DATASETS.POS_QUESTION_PROB = 0.6
167
+ _C.DATASETS.NEG_QUESTION_PROB = 0.8
168
+ _C.DATASETS.FULL_QUESTION_PROB = 0.5
169
+ _C.DATASETS.ONE_HOT = False
170
+ _C.DATASETS.NO_MINUS_ONE_FOR_ONE_HOT = False
171
+
172
+ _C.DATASETS.DISABLE_CLIP_TO_IMAGE = False
173
+ _C.DATASETS.SEPARATION_TOKENS = " "
174
+
175
+ # LVIS
176
+ _C.DATASETS.LVIS_USE_NORMAL_AP = False
177
+ _C.DATASETS.LVIS_TOPK = 10000
178
+ _C.DATASETS.SPECIAL_SAFEGUARD_FOR_COCO_GROUNDING = False
179
+
180
+ # Caption
181
+ _C.DATASETS.BING_INDEX_LIST = []
182
+ _C.DATASETS.CAPTION_MIN_BOX = 1
183
+ _C.DATASETS.REPLACE_CLEAN_LABEL = False
184
+ _C.DATASETS.FURTHER_SCREEN = False
185
+ _C.DATASETS.CAPTION_CONF = 0.9
186
+ _C.DATASETS.CAPTION_NMS = 0.9
187
+ _C.DATASETS.PACK_RANDOM_CAPTION_NUMBER = 0
188
+ _C.DATASETS.INFERENCE_CAPTION = False
189
+ _C.DATASETS.SAMPLE_NEGATIVE_FOR_GROUNDING_DATA = -1.0
190
+ _C.DATASETS.RANDOM_PACK_PROB = -1.0
191
+ _C.DATASETS.NO_RANDOM_PACK_PROBABILITY = 0.0
192
+ _C.DATASETS.SAFEGUARD_POSITIVE_CAPTION = True
193
+ _C.DATASETS.CAPTION_FORMAT_VERSION = "v1"
194
+ _C.DATASETS.LOCAL_DEBUG = False
195
+
196
+
197
+ # Od in the wild
198
+ _C.DATASETS.PREDEFINED_TEXT = None
199
+ _C.DATASETS.TRAIN_DATASETNAME_SUFFIX = ""
200
+ _C.DATASETS.TEST_DATASETNAME_SUFFIX = ""
201
+ _C.DATASETS.OVERRIDE_CATEGORY = None
202
+ _C.DATASETS.USE_OVERRIDE_CATEGORY = False
203
+ _C.DATASETS.SUPRESS_QUERY = None
204
+ _C.DATASETS.USE_SUPRESS_QUERY = False
205
+ _C.DATASETS.USE_CAPTION_PROMPT = False
206
+ _C.DATASETS.CAPTION_PROMPT = None
207
+
208
+ _C.DATASETS.PREDOWNLOAD_BING = False
209
+ _C.DATASETS.PREDOWNLOAD_WITH_AZCOPY = False
210
+ _C.DATASETS.FLICKR_GT_TYPE = "separate"
211
+
212
+ # PACO
213
+ _C.DATASETS.OD_TO_GROUNDING_VERSION = "legacy"
214
+
215
+ # description
216
+ _C.DATASETS.DESCRIPTION_FILE = None
217
+ _C.DATASETS.SIMILARITY_FILE = None
218
+ _C.DATASETS.CAPTION_VOCAB_FILE = None
219
+
220
+ # caption augmentation
221
+ _C.DATASETS.CAPTION_AUGMENTATION_VOCAB = None
222
+ _C.DATASETS.CAPTION_AUGMENTATION_VERSION = None
223
+
224
+ _C.DATASETS.CC_CAPTION_AUGMENTATION_VERSION = None
225
+
226
+ _C.DATASETS.KEEP_NOUN_RATIO = 0.0
227
+
228
+ # VQA
229
+ _C.DATASETS.DIVER_BOX_FOR_VQA = False
230
+
231
+ # -----------------------------------------------------------------------------
232
+ # DataLoader
233
+ # -----------------------------------------------------------------------------
234
+ _C.DATALOADER = CN()
235
+ # Number of data loading threads
236
+ _C.DATALOADER.NUM_WORKERS = 4
237
+ # If > 0, this enforces that each collated batch should have a size divisible
238
+ # by SIZE_DIVISIBILITY
239
+ _C.DATALOADER.SIZE_DIVISIBILITY = 0
240
+ # If True, each batch should contain only images for which the aspect ratio
241
+ # is compatible. This groups portrait images together, and landscape images
242
+ # are not batched with portrait images.
243
+ _C.DATALOADER.ASPECT_RATIO_GROUPING = True
244
+ # Define min number of keypoints required from GT, for example 10 out of 17
245
+ _C.DATALOADER.MIN_KPS_PER_IMS = 0
246
+ # Use random sampler during training
247
+ _C.DATALOADER.USE_RANDOM_SEED = False
248
+
249
+ _C.DATALOADER.DISTRIBUTE_CHUNK_AMONG_NODE = False
250
+ # ---------------------------------------------------------------------------- #
251
+ # Backbone options
252
+ # ---------------------------------------------------------------------------- #
253
+ _C.MODEL.BACKBONE = CN()
254
+
255
+ # The backbone conv body to use
256
+ # The string must match a function that is imported in modeling.model_builder
257
+ # (e.g., 'FPN.add_fpn_ResNet101_conv5_body' to specify a ResNet-101-FPN
258
+ # backbone)
259
+ _C.MODEL.BACKBONE.CONV_BODY = "R-50-C4"
260
+
261
+ # Add StopGrad at a specified stage so the bottom layers are frozen
262
+ _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2
263
+ _C.MODEL.BACKBONE.FREEZE = False
264
+ _C.MODEL.BACKBONE.GROUP = 1
265
+ _C.MODEL.BACKBONE.OUT_CHANNELS = 256 * 4
266
+ # Option to reset bn running statics
267
+ _C.MODEL.BACKBONE.RESET_BN = False
268
+ # Backbone Normalization Level
269
+ _C.MODEL.BACKBONE.NORM_LEVEL = 3
270
+ # BN for backbone
271
+ _C.MODEL.BACKBONE.USE_BN = False
272
+ # Sync BN for backbone
273
+ _C.MODEL.BACKBONE.USE_SYNCBN = False
274
+ _C.MODEL.BACKBONE.USE_NSYNCBN = False
275
+ # GN for backbone
276
+ _C.MODEL.BACKBONE.USE_GN = False
277
+ # Evo Norm for backbone
278
+ _C.MODEL.BACKBONE.USE_EN = False
279
+ # Layers for backbone
280
+ _C.MODEL.BACKBONE.USE_DFCONV = False
281
+ _C.MODEL.BACKBONE.USE_DYRELU = False
282
+ _C.MODEL.BACKBONE.USE_SE = False
283
+ _C.MODEL.BACKBONE.LAYER_SETUP = (3, 4, 6, 3)
284
+ _C.MODEL.BACKBONE.LAYER_SEARCH = CN(new_allowed=True)
285
+ _C.MODEL.BACKBONE.OUT_FEATURES = ("stage2", "stage3", "stage4", "stage5")
286
+ _C.MODEL.BACKBONE.FPN_LAYER = ()
287
+ _C.MODEL.BACKBONE.USE_CHECKPOINT = False
288
+ # Add JF efficient det cfgs
289
+ _C.MODEL.BACKBONE.EFFICIENT_DET_START_FROM = 3
290
+ _C.MODEL.BACKBONE.EFFICIENT_DET_COMPOUND = 0
291
+ _C.MODEL.BACKBONE.EFFICIENT_DET_BIFPN_VERSION = 0
292
+
293
+ _C.MODEL.BACKBONE.FUSION_VERSION = "v1" # Whether to use symmetric or non symmetric fusion
294
+
295
+ _C.MODEL.LANGUAGE_BACKBONE = CN()
296
+ _C.MODEL.LANGUAGE_BACKBONE.WEIGHT = ""
297
+ _C.MODEL.LANGUAGE_BACKBONE.FREEZE = False
298
+ _C.MODEL.LANGUAGE_BACKBONE.USE_CHECKPOINT = False
299
+ _C.MODEL.LANGUAGE_BACKBONE.TOKENIZER_TYPE = "bert-base-uncased"
300
+ _C.MODEL.LANGUAGE_BACKBONE.MODEL_TYPE = "bert-base-uncased"
301
+ _C.MODEL.LANGUAGE_BACKBONE.LANG_DIM = 768
302
+ _C.MODEL.LANGUAGE_BACKBONE.MAX_QUERY_LEN = 256
303
+ _C.MODEL.LANGUAGE_BACKBONE.N_LAYERS = 1
304
+ _C.MODEL.LANGUAGE_BACKBONE.UNUSED_TOKEN = 106
305
+ _C.MODEL.LANGUAGE_BACKBONE.MASK_SPECIAL = False
306
+
307
+ _C.MODEL.LANGUAGE_BACKBONE.RNN_TYPE = "lstm"
308
+ _C.MODEL.LANGUAGE_BACKBONE.VARIABLE_LENGTH = True
309
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_EMBEDDING_SIZE = 512
310
+ _C.MODEL.LANGUAGE_BACKBONE.WORD_VEC_SIZE = 512
311
+ _C.MODEL.LANGUAGE_BACKBONE.HIDDEN_SIZE = 512
312
+ _C.MODEL.LANGUAGE_BACKBONE.BIDIRECTIONAL = True
313
+ _C.MODEL.LANGUAGE_BACKBONE.INPUT_DROPOUT_P = 0.5
314
+ _C.MODEL.LANGUAGE_BACKBONE.DROPOUT_P = 0.2
315
+ _C.MODEL.LANGUAGE_BACKBONE.CORPUS_PATH = ""
316
+ _C.MODEL.LANGUAGE_BACKBONE.VOCAB_SIZE = 0
317
+
318
+ _C.MODEL.LANGUAGE_BACKBONE.PAD_MAX = True
319
+ # ---------------------------------------------------------------------------- #
320
+ # FPN options
321
+ # ---------------------------------------------------------------------------- #
322
+ _C.MODEL.FPN = CN()
323
+ _C.MODEL.FPN.FREEZE = False
324
+ _C.MODEL.FPN.USE_GN = False
325
+ _C.MODEL.FPN.USE_RELU = False
326
+ _C.MODEL.FPN.USE_DYRELU = False
327
+ _C.MODEL.FPN.DROP_BLOCK = True
328
+ _C.MODEL.FPN.DROP_PROB = 0.3
329
+ _C.MODEL.FPN.DROP_SIZE = 3
330
+ _C.MODEL.FPN.USE_SPP = False
331
+ _C.MODEL.FPN.USE_PAN = False
332
+ _C.MODEL.FPN.USE_DYHEAD = False
333
+ _C.MODEL.FPN.RETURN_SWINT_FEATURE_BEFORE_FUSION = False
334
+ # ---------------------------------------------------------------------------- #
335
+ # BIFPN options
336
+ # ---------------------------------------------------------------------------- #
337
+ _C.MODEL.BIFPN = CN()
338
+ _C.MODEL.BIFPN.NUM_REPEATS = 1
339
+ _C.MODEL.BIFPN.USE_ATTENTION = True
340
+
341
+ # ---------------------------------------------------------------------------- #
342
+ # Group Norm options
343
+ # ---------------------------------------------------------------------------- #
344
+ _C.MODEL.GROUP_NORM = CN()
345
+ # Number of dimensions per group in GroupNorm (-1 if using NUM_GROUPS)
346
+ _C.MODEL.GROUP_NORM.DIM_PER_GP = -1
347
+ # Number of groups in GroupNorm (-1 if using DIM_PER_GP)
348
+ _C.MODEL.GROUP_NORM.NUM_GROUPS = 16
349
+ # GroupNorm's small constant in the denominator
350
+ _C.MODEL.GROUP_NORM.EPSILON = 1e-5
351
+
352
+ # ---------------------------------------------------------------------------- #
353
+ # Evo Norm options
354
+ # ---------------------------------------------------------------------------- #
355
+ _C.MODEL.EVO_NORM = CN()
356
+ # Number of groups in EvoNorm (-1 if using DIM_PER_GP)
357
+ _C.MODEL.EVO_NORM.NUM_GROUPS = 8
358
+ # EvoNorm's small constant in the denominator
359
+ _C.MODEL.EVO_NORM.EPSILON = 1e-5
360
+
361
+ # ---------------------------------------------------------------------------- #
362
+ # RetinaNet Options (Follow the Detectron version)
363
+ # ---------------------------------------------------------------------------- #
364
+ _C.MODEL.RETINANET = CN()
365
+ # This is the number of foreground classes and background.
366
+ _C.MODEL.RETINANET.NUM_CLASSES = 81
367
+ # Convolutions to use in the cls and bbox tower
368
+ # NOTE: this doesn't include the last conv for logits
369
+ _C.MODEL.RETINANET.NUM_CONVS = 4
370
+ # During inference, #locs to select based on cls score before NMS is performed
371
+ # per FPN level
372
+ _C.MODEL.RETINANET.PRE_NMS_TOP_N = 1000
373
+ # Prior prob for the positives at the beginning of training. This is used to set
374
+ # the bias init for the logits layer
375
+ _C.MODEL.RETINANET.PRIOR_PROB = 0.01
376
+ # Inference cls score threshold, anchors with score > INFERENCE_TH are
377
+ # considered for inference
378
+ _C.MODEL.RETINANET.INFERENCE_TH = 0.05
379
+ # NMS threshold used in RetinaNet
380
+ _C.MODEL.RETINANET.NMS_TH = 0.4
381
+ _C.MODEL.RETINANET.DETECTIONS_PER_IMG = 100
382
+
383
+ # ---------------------------------------------------------------------------- #
384
+ # Focal Loss Options (Follow the Detectron version)
385
+ # ---------------------------------------------------------------------------- #
386
+ _C.MODEL.FOCAL = CN()
387
+ # Weight for bbox_regression loss
388
+ _C.MODEL.FOCAL.BBOX_REG_WEIGHT = 4.0
389
+ # Smooth L1 loss beta for bbox regression
390
+ _C.MODEL.FOCAL.BBOX_REG_BETA = 0.11
391
+ # IoU overlap ratio for labeling an anchor as positive
392
+ # Anchors with >= iou overlap are labeled positive
393
+ _C.MODEL.FOCAL.FG_IOU_THRESHOLD = 0.5
394
+ # IoU overlap ratio for labeling an anchor as negative
395
+ # Anchors with < iou overlap are labeled negative
396
+ _C.MODEL.FOCAL.BG_IOU_THRESHOLD = 0.4
397
+ # Focal loss parameter: alpha
398
+ _C.MODEL.FOCAL.LOSS_ALPHA = 0.25
399
+ # Focal loss parameter: gamma
400
+ _C.MODEL.FOCAL.LOSS_GAMMA = 2.0
401
+
402
+ # ---------------------------------------------------------------------------- #
403
+ # FCOS Options
404
+ # ---------------------------------------------------------------------------- #
405
+ _C.MODEL.FCOS = CN()
406
+ _C.MODEL.FCOS.NUM_CLASSES = 81 # the number of classes including background
407
+ _C.MODEL.FCOS.FPN_STRIDES = [8, 16, 32, 64, 128]
408
+ _C.MODEL.FCOS.PRIOR_PROB = 0.01
409
+ _C.MODEL.FCOS.INFERENCE_TH = 0.05
410
+ _C.MODEL.FCOS.NMS_TH = 0.6
411
+ _C.MODEL.FCOS.PRE_NMS_TOP_N = 1000
412
+
413
+ # the number of convolutions used in the cls and bbox tower
414
+ _C.MODEL.FCOS.NUM_CONVS = 4
415
+ # if use deformable conv to align features
416
+ _C.MODEL.FCOS.USE_DFCONV = False
417
+
418
+ # if CENTER_SAMPLING_RADIUS <= 0, it will disable center sampling
419
+ _C.MODEL.FCOS.CENTER_SAMPLING_RADIUS = 0.0
420
+ # IOU_LOSS_TYPE can be "iou", "linear_iou" or "giou"
421
+ _C.MODEL.FCOS.IOU_LOSS_TYPE = "iou"
422
+
423
+ _C.MODEL.FCOS.NORM_REG_TARGETS = False
424
+ _C.MODEL.FCOS.CENTERNESS_ON_REG = False
425
+ _C.MODEL.FCOS.USE_GT_CENTER = False
426
+
427
+ _C.MODEL.FCOS.DETECTIONS_PER_IMG = 100
428
+ _C.MODEL.FCOS.USE_GN = False
429
+ _C.MODEL.FCOS.USE_BN = False
430
+
431
+ _C.MODEL.FCOS.INFERENCE_TH_TRAIN = 0.0
432
+ _C.MODEL.FCOS.PRE_NMS_TOP_N_TRAIN = 3000
433
+ _C.MODEL.FCOS.POST_NMS_TOP_N_TRAIN = 1000
434
+
435
+ # ---------------------------------------------------------------------------- #
436
+ # ATSS Options
437
+ # ---------------------------------------------------------------------------- #
438
+ _C.MODEL.ATSS = CN()
439
+ _C.MODEL.ATSS.NUM_CLASSES = 81 # the number of classes including background
440
+ _C.MODEL.ATSS.PRIOR_PROB = 0.01
441
+ _C.MODEL.ATSS.INFERENCE_TH = 0.05
442
+ _C.MODEL.ATSS.NMS_TH = 0.6
443
+ _C.MODEL.ATSS.PRE_NMS_TOP_N = 1000
444
+
445
+ # the number of convolutions used in the cls and bbox tower
446
+ _C.MODEL.ATSS.NUM_CONVS = 4
447
+ # the channels of convolutions used in the cls and bbox tower
448
+ _C.MODEL.ATSS.CHANNELS = 128
449
+ # if use deformable conv to align features
450
+ _C.MODEL.ATSS.USE_DFCONV = False
451
+
452
+ # topk for selecting candidate positive samples from each level
453
+ _C.MODEL.ATSS.TOPK = 9
454
+
455
+ # Weight for bbox_regression loss
456
+ _C.MODEL.ATSS.REG_LOSS_WEIGHT = 2.0
457
+
458
+ _C.MODEL.ATSS.DETECTIONS_PER_IMG = 100
459
+ _C.MODEL.ATSS.USE_GN = False
460
+ _C.MODEL.ATSS.USE_BN = False
461
+
462
+ _C.MODEL.ATSS.USE_DYRELU = False
463
+ _C.MODEL.ATSS.USE_SE = False
464
+
465
+ _C.MODEL.ATSS.INFERENCE_TH_TRAIN = 0.0
466
+ _C.MODEL.ATSS.PRE_NMS_TOP_N_TRAIN = 3000
467
+ _C.MODEL.ATSS.POST_NMS_TOP_N_TRAIN = 1000
468
+ # ---------------------------------------------------------------------------- #
469
+ # DYHEAD Options
470
+ # ---------------------------------------------------------------------------- #
471
+ _C.MODEL.DYHEAD = CN()
472
+ _C.MODEL.DYHEAD.NUM_CLASSES = 81 # the number of classes including background
473
+ _C.MODEL.DYHEAD.PRIOR_PROB = 0.01
474
+
475
+ # the number of convolutions used in the cls and bbox tower
476
+ _C.MODEL.DYHEAD.NUM_CONVS = 4
477
+ # the channels of convolutions used in the cls and bbox tower
478
+ _C.MODEL.DYHEAD.CHANNELS = 128
479
+ _C.MODEL.DYHEAD.GROUPS = 1
480
+ # if use deformable conv to align features
481
+ _C.MODEL.DYHEAD.USE_DFCONV = False
482
+
483
+ # topk for selecting candidate positive samples from each level
484
+ _C.MODEL.DYHEAD.TOPK = 9
485
+
486
+ _C.MODEL.DYHEAD.SCORE_AGG = "MEAN" # MEAN or MAX, for binary focal loss score aggregation
487
+
488
+ _C.MODEL.DYHEAD.LOG_SCALE = 0.0 # temperature (dot product)
489
+ _C.MODEL.DYHEAD.SHALLOW_LOG_SCALE = 0.0 # # temperature (shallow contrastive)
490
+
491
+ _C.MODEL.DYHEAD.USE_GN = False
492
+ _C.MODEL.DYHEAD.USE_NSYNCBN = False
493
+ _C.MODEL.DYHEAD.USE_SYNCBN = False
494
+
495
+ _C.MODEL.DYHEAD.USE_DYFUSE = False
496
+ _C.MODEL.DYHEAD.USE_DYRELU = False
497
+
498
+ _C.MODEL.DYHEAD.CONV_FUNC = ""
499
+
500
+ # CosineSimOutputLayers: https://github.com/ucbdrive/few-shot-object-detection/blob/master/fsdet/modeling/roi_heads/fast_rcnn.py#L448-L464
501
+ _C.MODEL.DYHEAD.COSINE_SCALE = -1.0
502
+
503
+ _C.MODEL.DYHEAD.FUSE_CONFIG = CN()
504
+ _C.MODEL.DYHEAD.FUSE_CONFIG.EARLY_FUSE_ON = False
505
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TYPE = ""
506
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_SIZE = 256
507
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_OUT_SIZE = 256
508
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_EMB_DROPOUT = 0.1
509
+ _C.MODEL.DYHEAD.FUSE_CONFIG.JOINT_MLP_LAYERS = 2
510
+
511
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CLASSIFICATION_LOSS = False
512
+
513
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_TOKEN_LOSS = False
514
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_LOSS_WEIGHT = 1.0
515
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_GAMMA = 2.0
516
+ _C.MODEL.DYHEAD.FUSE_CONFIG.TOKEN_ALPHA = 0.25
517
+
518
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_DOT_PRODUCT_TOKEN_LOSS = False
519
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_CONTRASTIVE_ALIGN_LOSS = False
520
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_HIDDEN_DIM = 64
521
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CONTRASTIVE_ALIGN_LOSS_WEIGHT = 1.0
522
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DOT_PRODUCT_TOKEN_LOSS_WEIGHT = 1.0
523
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_LAYER_SCALE = True
524
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SEPARATE_BIDIRECTIONAL = False
525
+ _C.MODEL.DYHEAD.FUSE_CONFIG.STABLE_SOFTMAX_2D = False
526
+
527
+ _C.MODEL.DYHEAD.FUSE_CONFIG.DO_LANG_PROJ_OUTSIDE_CHECKPOINT = False
528
+
529
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_FUSED_FEATURES_DOT_PRODUCT = False
530
+
531
+ # Controls for
532
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MIN_FOR_UNDERFLOW = False
533
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_MAX_FOR_OVERFLOW = False
534
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MIN_FOR_UNDERFLOW = False
535
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_BERTATTN_MAX_FOR_OVERFLOW = False
536
+ _C.MODEL.DYHEAD.FUSE_CONFIG.CLAMP_DOT_PRODUCT = False
537
+
538
+ # MLM Loss
539
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS = False
540
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_FOR_ONLY_POSITIVES = True
541
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_OD = False
542
+ _C.MODEL.DYHEAD.FUSE_CONFIG.NO_MASK_FOR_GOLD = False
543
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_LOSS_COEF = 1.0
544
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MLM_OBJ_FOR_ONLY_POSITIVE = False
545
+
546
+ # Shallow Contrastive Loss (FPN)
547
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_CONTRASTIVE_LOSS = False
548
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_MAX_POSITIVE_ANCHORS = 100
549
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SHALLOW_ZERO_PADS = False
550
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_HIDDEN_DIM = 64
551
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SHALLOW_CONTRASTIVE_LOSS_WEIGHT = 1.0
552
+
553
+ # Span Loss
554
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_SPAN_LOSS = False # will reuse the green light span field to indicate span boundary
555
+ _C.MODEL.DYHEAD.FUSE_CONFIG.SPAN_VERSION = None
556
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NOOBJ_TOKEN = False
557
+
558
+
559
+ # Shallow Contrastive Loss (BACKBONE)
560
+ _C.MODEL.DYHEAD.FUSE_CONFIG.USE_BACKBONE_SHALLOW_CONTRASTIVE_LOSS = False
561
+
562
+ _C.MODEL.DYHEAD.FUSE_CONFIG.ADD_LINEAR_LAYER = False
563
+ # Mute non-essential tokens
564
+ _C.MODEL.DYHEAD.FUSE_CONFIG.MUTE_NON_ESSENTIAL_TOKENS = False
565
+ # use checkpoint to save memory
566
+ _C.MODEL.DYHEAD.USE_CHECKPOINT = False
567
+
568
+ # ---------------------------------------------------------------------------- #
569
+ # DYDETR Options
570
+ # ---------------------------------------------------------------------------- #
571
+ _C.MODEL.DYDETR = CN()
572
+ _C.MODEL.DYDETR.NHEADS = 8
573
+ _C.MODEL.DYDETR.DROPOUT = 0.0
574
+ _C.MODEL.DYDETR.DIM_FEEDFORWARD = 2048
575
+ _C.MODEL.DYDETR.ACTIVATION = "relu"
576
+ _C.MODEL.DYDETR.HIDDEN_DIM = 256
577
+ _C.MODEL.DYDETR.NUM_CLS = 1
578
+ _C.MODEL.DYDETR.NUM_REG = 3
579
+ _C.MODEL.DYDETR.NUM_HEADS = 6
580
+ _C.MODEL.DYDETR.NUM_CLASSES = 81
581
+ _C.MODEL.DYDETR.NUM_PROPOSALS = 300
582
+
583
+ # Dynamic Conv.
584
+ _C.MODEL.DYDETR.NUM_DYNAMIC = 2
585
+ _C.MODEL.DYDETR.DIM_DYNAMIC = 64
586
+
587
+ # Loss.
588
+ _C.MODEL.DYDETR.CLASS_WEIGHT = 2.0
589
+ _C.MODEL.DYDETR.GIOU_WEIGHT = 2.0
590
+ _C.MODEL.DYDETR.L1_WEIGHT = 5.0
591
+ _C.MODEL.DYDETR.DEEP_SUPERVISION = True
592
+ _C.MODEL.DYDETR.NO_OBJECT_WEIGHT = 0.1
593
+
594
+ # Focal Loss.
595
+ _C.MODEL.DYDETR.USE_FOCAL = True
596
+ _C.MODEL.DYDETR.ALPHA = 0.25
597
+ _C.MODEL.DYDETR.GAMMA = 2.0
598
+ _C.MODEL.DYDETR.PRIOR_PROB = 0.01
599
+
600
+ _C.MODEL.DYDETR.APPEND_BOX = False
601
+
602
+ # GROUNDING RELATED
603
+ _C.MODEL.DYDETR.INCLUDE_LANGUAGE_DECODER = False
604
+ _C.MODEL.DYDETR.USE_DOT_PRODUCT_TOKEN_LOSS = False
605
+ _C.MODEL.DYDETR.LOG_SCALE = 0.0 # temperature
606
+ _C.MODEL.DYDETR.RESET_PARAMETERS = True
607
+ _C.MODEL.DYDETR.USE_GROUNDING_MATCHER_SETCRITERION = False
608
+ _C.MODEL.DYDETR.MDETR_PLAIN_INFERENCE = False
609
+ _C.MODEL.DYDETR.OVERRIDE_LANGUAGE_MODEL_FOR_TOKEN_LOSS = False
610
+ _C.MODEL.DYDETR.NORMALIZE_PER_BOX = False
611
+ _C.MODEL.DYDETR.RESET_SKIP_DOT_PRODUCT_WEIGHTS = False
612
+ _C.MODEL.DYDETR.DEBUG = False
613
+ _C.MODEL.DYDETR.AGGREGATE_METHOD = "MEAN"
614
+ _C.MODEL.DYDETR.EARLY_FUSE_ON = False
615
+ _C.MODEL.DYDETR.DYTOWER_ON = False
616
+ _C.MODEL.DYDETR.USE_FUSED_LANGUAGE_FEATURES = True
617
+ # ---------------------------------------------------------------------------- #
618
+ # RPN options
619
+ # ---------------------------------------------------------------------------- #
620
+ _C.MODEL.RPN = CN()
621
+ _C.MODEL.RPN.USE_FPN = False
622
+ # Base RPN anchor sizes given in absolute pixels w.r.t. the scaled network input
623
+ _C.MODEL.RPN.ANCHOR_SIZES = (32, 64, 128, 256, 512)
624
+ # Stride of the feature map that RPN is attached.
625
+ # For FPN, number of strides should match number of scales
626
+ _C.MODEL.RPN.ANCHOR_STRIDE = (16,)
627
+ # RPN anchor aspect ratios
628
+ _C.MODEL.RPN.ASPECT_RATIOS = (0.5, 1.0, 2.0)
629
+ # Anchor shift away ration from the center for r,t,l,d
630
+ _C.MODEL.RPN.ANCHOR_SHIFT = (0.0, 0.0, 0.0, 0.0)
631
+ # Use center to decide anchor size
632
+ _C.MODEL.RPN.USE_RELATIVE_SIZE = False
633
+ # Remove RPN anchors that go outside the image by RPN_STRADDLE_THRESH pixels
634
+ # Set to -1 or a large value, e.g. 100000, to disable pruning anchors
635
+ _C.MODEL.RPN.STRADDLE_THRESH = 0
636
+ # Anchor scales per octave for complex anchors
637
+ _C.MODEL.RPN.OCTAVE = 2.0
638
+ _C.MODEL.RPN.SCALES_PER_OCTAVE = 3
639
+ # Minimum overlap required between an anchor and ground-truth box for the
640
+ # (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD
641
+ # ==> positive RPN example)
642
+ _C.MODEL.RPN.FG_IOU_THRESHOLD = 0.7
643
+ # Maximum overlap allowed between an anchor and ground-truth box for the
644
+ # (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD
645
+ # ==> negative RPN example)
646
+ _C.MODEL.RPN.BG_IOU_THRESHOLD = 0.3
647
+ # Total number of RPN examples per image
648
+ _C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256
649
+ # Target fraction of foreground (positive) examples per RPN minibatch
650
+ _C.MODEL.RPN.POSITIVE_FRACTION = 0.5
651
+ # Number of top scoring RPN proposals to keep before applying NMS
652
+ # When FPN is used, this is *per FPN level* (not total)
653
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TRAIN = 12000
654
+ _C.MODEL.RPN.PRE_NMS_TOP_N_TEST = 6000
655
+ # Number of top scoring RPN proposals to keep after applying NMS
656
+ _C.MODEL.RPN.POST_NMS_TOP_N_TRAIN = 2000
657
+ _C.MODEL.RPN.POST_NMS_TOP_N_TEST = 1000
658
+ # NMS threshold used on RPN proposals
659
+ _C.MODEL.RPN.NMS_THRESH = 0.7
660
+ # Proposal height and width both need to be greater than RPN_MIN_SIZE
661
+ # (a the scale used during training or inference)
662
+ _C.MODEL.RPN.MIN_SIZE = 0
663
+ # Number of top scoring RPN proposals to keep after combining proposals from
664
+ # all FPN levels
665
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000
666
+ _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000
667
+ # Custom rpn head, empty to use default conv or separable conv
668
+ _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead"
669
+ _C.MODEL.RPN.FREEZE = False
670
+ _C.MODEL.RPN.FORCE_BOXES = False
671
+ _C.MODEL.RPN.RETURN_FUSED_FEATURES = False
672
+
+ # ---------------------------------------------------------------------------- #
+ # ROI HEADS options
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.ROI_HEADS = CN()
+ _C.MODEL.ROI_HEADS.USE_FPN = False
+ # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD)
+ _C.MODEL.ROI_HEADS.FG_IOU_THRESHOLD = 0.5
+ # Overlap threshold for an RoI to be considered background
+ # (class = 0 if overlap in [0, BG_IOU_THRESHOLD))
+ _C.MODEL.ROI_HEADS.BG_IOU_THRESHOLD = 0.5
+ # Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets
+ # These are empirically chosen to approximately lead to unit variance targets
+ _C.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0)
+ # RoI minibatch size *per image* (number of regions of interest [RoIs])
+ # Total number of RoIs per training minibatch =
+ # TRAIN.BATCH_SIZE_PER_IM * TRAIN.IMS_PER_BATCH * NUM_GPUS
+ # E.g., a common configuration is: 512 * 2 * 8 = 8192
+ _C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512
+ # Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0)
+ _C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25
+
+ # Only used in test mode
+
+ # Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to
+ # balance obtaining high recall with not having too many low precision
+ # detections that will slow down inference post-processing steps (like NMS)
+ _C.MODEL.ROI_HEADS.SCORE_THRESH = 0.05
+ # Overlap threshold used for non-maximum suppression (suppress boxes with
+ # IoU >= this threshold)
+ _C.MODEL.ROI_HEADS.NMS = 0.5
+ # Maximum number of detections to return per image (100 is based on the limit
+ # established for the COCO dataset)
+ _C.MODEL.ROI_HEADS.DETECTIONS_PER_IMG = 100
+
+ _C.MODEL.ROI_BOX_HEAD = CN()
+ _C.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+ _C.MODEL.ROI_BOX_HEAD.PREDICTOR = "FastRCNNPredictor"
+ _C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_BOX_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_BOX_HEAD.NUM_CLASSES = 81
+ # Hidden layer dimension when using an MLP for the RoI box head
+ _C.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM = 1024
+ # GN
+ _C.MODEL.ROI_BOX_HEAD.USE_GN = False
+ # Dilation
+ _C.MODEL.ROI_BOX_HEAD.DILATION = 1
+ _C.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM = 256
+ _C.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS = 4
+ # Use D2-style ROIAlignV2
+ _C.MODEL.ROI_BOX_HEAD.POOLER_ALIGNED = False
+
+ _C.MODEL.ROI_MASK_HEAD = CN()
+ _C.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR = "ResNet50Conv5ROIFeatureExtractor"
+ _C.MODEL.ROI_MASK_HEAD.PREDICTOR = "MaskRCNNC4Predictor"
+ _C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_MASK_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_MASK_HEAD.MLP_HEAD_DIM = 1024
+ _C.MODEL.ROI_MASK_HEAD.CONV_LAYERS = (256, 256, 256, 256)
+ _C.MODEL.ROI_MASK_HEAD.RESOLUTION = 14
+ _C.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+ # Whether or not to resize and translate masks to the input image
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS = False
+ _C.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD = 0.5
+ # Dilation
+ _C.MODEL.ROI_MASK_HEAD.DILATION = 1
+ # GN
+ _C.MODEL.ROI_MASK_HEAD.USE_GN = False
+ # HG
+ _C.MODEL.ROI_MASK_HEAD.HG_SCALE = 1
+
+ _C.MODEL.ROI_KEYPOINT_HEAD = CN()
+ _C.MODEL.ROI_KEYPOINT_HEAD.FEATURE_EXTRACTOR = "KeypointRCNNFeatureExtractor"
+ _C.MODEL.ROI_KEYPOINT_HEAD.PREDICTOR = "KeypointRCNNPredictor"
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0
+ _C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SCALES = (1.0 / 16,)
+ _C.MODEL.ROI_KEYPOINT_HEAD.MLP_HEAD_DIM = 1024
+ _C.MODEL.ROI_KEYPOINT_HEAD.CONV_LAYERS = tuple(512 for _ in range(8))
+ _C.MODEL.ROI_KEYPOINT_HEAD.RESOLUTION = 14
+ _C.MODEL.ROI_KEYPOINT_HEAD.NUM_CLASSES = 17
+ _C.MODEL.ROI_KEYPOINT_HEAD.KEYPOINT_NAME = ()  # If left empty, use default names
+ _C.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR = True
+
+ # ---------------------------------------------------------------------------- #
+ # ResNe[X]t options (ResNets = {ResNet, ResNeXt})
+ # Note that parts of a resnet may be used for both the backbone and the head
+ # These options apply to both
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.RESNETS = CN()
+
+ _C.MODEL.RESNETS.USE_STEM3X3 = False
+ _C.MODEL.RESNETS.WITH_SE = False
+ _C.MODEL.RESNETS.USE_AVG_DOWN = False
+
+ # Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt
+ _C.MODEL.RESNETS.NUM_GROUPS = 1
+
+ # Baseline width of each group
+ _C.MODEL.RESNETS.WIDTH_PER_GROUP = 64
+
+ # Place the stride-2 conv on the 1x1 filter
+ # Use True only for the original MSRA ResNet; use False for C2 and Torch models
+ _C.MODEL.RESNETS.STRIDE_IN_1X1 = True
+
+ # Residual transformation function
+ _C.MODEL.RESNETS.TRANS_FUNC = "BottleneckWithFixedBatchNorm"
+ # ResNet's stem function (conv1 and pool1)
+ _C.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
+
+ # Apply dilation in stage "res5"
+ _C.MODEL.RESNETS.RES5_DILATION = 1
+
+ _C.MODEL.RESNETS.BACKBONE_OUT_CHANNELS = 256 * 4
+ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256
+ _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64
+
+ _C.MODEL.RESNETS.REVISION = "resnet_light"
+ # Deformable convolutions
+ _C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False)
+ _C.MODEL.RESNETS.WITH_MODULATED_DCN = False
+ _C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1
+
+ # ---------------------------------------------------------------------------- #
+ # Swin Transformer
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.SWINT = CN()
+ _C.MODEL.SWINT.EMBED_DIM = 96
+ _C.MODEL.SWINT.OUT_CHANNELS = (96, 192, 384, 768)
+ _C.MODEL.SWINT.DEPTHS = (2, 2, 6, 2)
+ _C.MODEL.SWINT.NUM_HEADS = (3, 6, 12, 24)
+ _C.MODEL.SWINT.WINDOW_SIZE = 7
+ _C.MODEL.SWINT.MLP_RATIO = 4
+ _C.MODEL.SWINT.DROP_PATH_RATE = 0.2
+ _C.MODEL.SWINT.APE = False
+ _C.MODEL.SWINT.VERSION = "v1"
+ _C.MODEL.SWINT.OUT_NORM = True
+ _C.MODEL.SWINT.LAYER_SCALE = 0
+
+ # ---------------------------------------------------------------------------- #
+ # CVT SPEC
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.SPEC = CN(new_allowed=True)
+
+ # ---------------------------------------------------------------------------- #
+ # CLIP SPEC
+ # ---------------------------------------------------------------------------- #
+ _C.MODEL.CLIP = CN()
+ _C.MODEL.CLIP.CONTEXT_LENGTH = 256  # default 77
+ _C.MODEL.CLIP.WIDTH = 512
+ _C.MODEL.CLIP.LAYERS = 12
+ _C.MODEL.CLIP.HEADS = 8
+ _C.MODEL.CLIP.DROP_PATH = 0.0
+ _C.MODEL.CLIP.TOKENIZER = "clip"
+ _C.MODEL.CLIP.VOCAB_SIZE = 49408
+
+ # ---------------------------------------------------------------------------- #
+ # SEARCH
+ # ---------------------------------------------------------------------------- #
+
+ _C.SEARCH = CN()
+ _C.SEARCH.MAX_EPOCH = 20
+ _C.SEARCH.SELECT_NUM = 20
+ _C.SEARCH.POPULATION_NUM = 64
+ _C.SEARCH.MUTATION_NUM = 24
+ _C.SEARCH.CROSSOVER_NUM = 24
+ _C.SEARCH.MUTATION_PROB = 0.1
+
+ # ---------------------------------------------------------------------------- #
+ # Solver
+ # ---------------------------------------------------------------------------- #
+ _C.SOLVER = CN()
+ _C.SOLVER.USE_AMP = False
+
+ _C.SOLVER.MAX_ITER = 40000
+ _C.SOLVER.MULTI_MAX_ITER = ()  # set a different max iter for each stage
+ _C.SOLVER.MAX_EPOCH = 0  # any epoch number > 0 will overwrite max_iter
+ _C.SOLVER.MULTI_MAX_EPOCH = ()  # set a different max epoch for each stage
+
+ _C.SOLVER.OPTIMIZER = "SGD"  # "ADAMW"
+
+ _C.SOLVER.BASE_LR = 0.001
+
+ _C.SOLVER.LANG_LR = 0.00001
+ _C.SOLVER.BACKBONE_BODY_LR_FACTOR = 1.0
+ _C.SOLVER.FUSION_LR_FACTOR = 1.0
+
+ _C.SOLVER.BIAS_LR_FACTOR = 2
+ _C.SOLVER.GRAD_CLIP = 0.0
+ # D2 gradient clip
+ _C.SOLVER.CLIP_GRADIENTS = CN()
+ _C.SOLVER.CLIP_GRADIENTS.ENABLED = False
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.0
+ _C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
+ _C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0
+ _C.SOLVER.MODEL_EMA = 0.0
+
+ _C.SOLVER.MOMENTUM = 0.9
+
+ _C.SOLVER.WEIGHT_DECAY = 0.0005
+ _C.SOLVER.WEIGHT_DECAY_BIAS = 0.0
+ _C.SOLVER.WEIGHT_DECAY_NORM_FACTOR = 1.0
+ _C.SOLVER.WEIGHT_DECAY_HEAD_FACTOR = 1.0
+
+ # Use a cosine lr schedule to replace the default multistage one
+ _C.SOLVER.USE_COSINE = False
+ _C.SOLVER.MIN_LR = 0.000001
+
+ _C.SOLVER.GAMMA = 0.1
+ _C.SOLVER.STEPS = (30000,)
+
+ _C.SOLVER.USE_AUTOSTEP = False
+ _C.SOLVER.STEP_PATIENCE = 5
+
+ _C.SOLVER.WARMUP_FACTOR = 1.0 / 3
+ _C.SOLVER.WARMUP_ITERS = 500
+ _C.SOLVER.WARMUP_METHOD = "linear"
+
+ _C.SOLVER.CHECKPOINT_PERIOD = 2500
+ _C.SOLVER.CHECKPOINT_PER_EPOCH = -1.0
+ _C.SOLVER.TEST_WITH_INFERENCE = False
+ _C.SOLVER.AUTO_TERMINATE_PATIENCE = -1
+ # Number of images per batch
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+ # see 2 images per batch
+ _C.SOLVER.IMS_PER_BATCH = 16
+ # This is the max negative ratio allowed per batch
+ _C.SOLVER.MAX_NEG_PER_BATCH = 0.1
+
+ _C.SOLVER.SEED = 0
+ _C.SOLVER.DISABLE_OUTPUT_DISTRIBUTED = False
+
+ _C.SOLVER.PROMPT_PROBING_LEVEL = -1.0
+ # -1 means tuning the whole model;
+ # 1 means tuning the whole language model; 1.5 means tuning the box head as well
+
+ _C.SOLVER.FIND_UNUSED_PARAMETERS = True
+ _C.SOLVER.DATASET_LENGTH = -1  # just for logging purposes
+ _C.SOLVER.TUNING_HIGHLEVEL_OVERRIDE = None
+ _C.SOLVER.USE_EMA_FOR_MONITOR = False
+
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE = False
+ _C.SOLVER.WEIGHT_DECAY_SCHEDULE_RATIO = 0.667
+ _C.SOLVER.RESUME_SKIP_SCHEDULE = False  # when we resume from a checkpoint, we can skip the schedule
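A small sketch of how a MAX_EPOCH setting plausibly translates into an iteration budget given the global IMS_PER_BATCH above. The trainer code is not part of this diff, so treat the exact rounding as an assumption; the dataset size is illustrative only.

dataset_length = 118287  # e.g. COCO train2017; illustrative only
ims_per_batch = 16       # SOLVER.IMS_PER_BATCH (global across all GPUs)
max_epoch = 12
max_iter = max_epoch * dataset_length // ims_per_batch  # 88715 iterations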
+
+ # ---------------------------------------------------------------------------- #
+ # Specific test options
+ # ---------------------------------------------------------------------------- #
+ _C.TEST = CN()
+ _C.TEST.EXPECTED_RESULTS = []
+ _C.TEST.EXPECTED_RESULTS_SIGMA_TOL = 4
+ _C.TEST.DURING_TRAINING = False
+ # Number of images per batch
+ # This is global, so if we have 8 GPUs and IMS_PER_BATCH = 16, each GPU will
+ # see 2 images per batch
+ _C.TEST.IMS_PER_BATCH = 16
+ # Special test configuration
+ _C.TEST.USE_MULTISCALE = False
+ # _C.TEST.SCALES = (400, 600, 800, 1000, 1200, 1400)
+ # _C.TEST.RANGES = ((96, 10000), (64, 10000), (0, 10000), (0, 10000), (0, 256), (0, 192))
+ _C.TEST.SCALES = (400, 500, 600, 640, 700, 900, 1000, 1100, 1200, 1300, 1400, 1800)
+ _C.TEST.RANGES = (
+     (96, 10000),
+     (96, 10000),
+     (64, 10000),
+     (64, 10000),
+     (64, 10000),
+     (0, 10000),
+     (0, 10000),
+     (0, 256),
+     (0, 256),
+     (0, 192),
+     (0, 192),
+     (0, 96),
+ )
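TEST.SCALES and TEST.RANGES have the same length, which suggests an elementwise pairing: each test resolution handles objects in the matching size range. A hedged sketch of that reading (the multiscale inference code is not shown in this diff):

for scale, (lo, hi) in zip(_C.TEST.SCALES, _C.TEST.RANGES):
    print("shorter side {}: keep detections with size in [{}, {})".format(scale, lo, hi))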
+ _C.TEST.MAX_SIZE = 2500
+ _C.TEST.FLIP = True
+ _C.TEST.SPECIAL_NMS = "none"  # ('none', 'soft-nms', 'vote', 'soft-vote')
+ _C.TEST.TH = 0.6  # threshold for nms or vote
+ _C.TEST.PRE_NMS_TOP_N = 1000
+ _C.TEST.NUM_CLASSES = 81
+ _C.TEST.SELECT_CLASSES = ()
+
+ _C.TEST.EVAL_TASK = ""
+ _C.TEST.SUBSET = -1
+ _C.TEST.CHUNKED_EVALUATION = -1
+ _C.TEST.MDETR_STYLE_AGGREGATE_CLASS_NUM = -1
+ _C.TEST.CHUNK_METHOD = "random"  # or similar
+ _C.TEST.CHUNK_INFERENCE_VERSION = "v1"  # v2: modify the ATSS inference code slightly to make
+ # ---------------------------------------------------------------------------- #
+ # Misc options
+ # ---------------------------------------------------------------------------- #
+ _C.OUTPUT_DIR = "OUTPUT"
+
+ _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py")
+
+ # TensorBoard experiment location
+ _C.TENSORBOARD_EXP = "OUTPUT"
+
+ _C.GLIPKNOW = CN()
+ _C.GLIPKNOW.KNOWLEDGE_FILE = ""
+ _C.GLIPKNOW.KNOWLEDGE_TYPE = ""
+ _C.GLIPKNOW.MAX_NUM_CLASSES_PER_BATCH_TRAIN = -1
+ _C.GLIPKNOW.PARALLEL_LANGUAGE_INPUT = False
+ _C.GLIPKNOW.LAN_FEATURE_AGG_TYPE = "first"
+ _C.GLIPKNOW.GPT3_NUM = 5
+ _C.GLIPKNOW.WIKI_AND_GPT3 = False
maskrcnn_benchmark/config/paths_catalog.py ADDED
@@ -0,0 +1,779 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ """Centralized catalog of paths."""
+
+ import os
+
+
+ def try_to_find(file, return_dir=False, search_path=["./DATASET", "./OUTPUT", "./data", "./MODEL"]):
+     if not file:
+         return file
+
+     if file.startswith("catalog://"):
+         return file
+
+     DATASET_PATH = ["./"]
+     if "DATASET" in os.environ:
+         DATASET_PATH.append(os.environ["DATASET"])
+     DATASET_PATH += search_path
+
+     for path in DATASET_PATH:
+         if os.path.exists(os.path.join(path, file)):
+             if return_dir:
+                 return path
+             else:
+                 return os.path.join(path, file)
+
+     print("Cannot find {} in {}".format(file, DATASET_PATH))
+     exit(1)
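A quick illustration of how try_to_find resolves paths; the directory layout here is hypothetical:

# Assuming ./DATASET/coco/train2017 exists on disk:
try_to_find("coco/train2017")                   # -> "./DATASET/coco/train2017"
try_to_find("coco/train2017", return_dir=True)  # -> "./DATASET"
try_to_find("catalog://R-50")                   # catalog URIs pass through unchanged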
+
+
+ class DatasetCatalog(object):
+     DATASETS = {
+         # pretrained grounding dataset
+         # mixed vg and coco
+         "mixed_train": {
+             "coco_img_dir": "coco/train2014",
+             "vg_img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/final_mixed_train.json",
+         },
+         "mixed_train_no_coco": {
+             "coco_img_dir": "coco/train2014",
+             "vg_img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/final_mixed_train_no_coco.json",
+         },
+         # flickr30k
+         "flickr30k_train": {
+             "img_folder": "flickr30k/flickr30k_images/train",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_train.json",
+             "is_train": True,
+         },
+         "flickr30k_val": {
+             "img_folder": "flickr30k/flickr30k_images/val",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_val.json",
+             "is_train": False,
+         },
+         "flickr30k_test": {
+             "img_folder": "flickr30k/flickr30k_images/test",
+             "ann_file": "mdetr_annotations/final_flickr_separateGT_test.json",
+             "is_train": False,
+         },
+         # refcoco
+         "refexp_all_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/final_refexp_val.json",
+             "is_train": False,
+         },
+         "refcoco_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_train.json",
+             "is_train": True,
+         },
+         "refcoco_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+             "is_train": False,
+         },
+         "refcoco_real_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_val.json",
+             "is_train": False,
+         },
+         "refcoco_testA": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_testA.json",
+             "is_train": False,
+         },
+         "refcoco_testB": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco_testB.json",
+             "is_train": False,
+         },
+         "refcoco+_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_train.json",
+             "is_train": True,
+         },
+         "refcoco+_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_val.json",
+             "is_train": False,
+         },
+         "refcoco+_testA": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_testA.json",
+             "is_train": False,
+         },
+         "refcoco+_testB": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcoco+_testB.json",
+             "is_train": False,
+         },
+         "refcocog_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_train.json",
+             "is_train": True,
+         },
+         "refcocog_val": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_val.json",
+             "is_train": False,
+         },
+         "refcocog_test": {
+             "img_dir": "coco/train2014",
+             "ann_file": "mdetr_annotations/finetune_refcocog_test_corrected.json",
+             "is_train": False,
+         },
+         # gqa
+         "gqa_val": {"img_dir": "gqa/images", "ann_file": "mdetr_annotations/final_gqa_val.json", "is_train": False},
+         # phrasecut
+         "phrasecut_train": {
+             "img_dir": "gqa/images",
+             "ann_file": "mdetr_annotations/finetune_phrasecut_train.json",
+             "is_train": True,
+         },
+         # caption
+         "bing_caption_train": {
+             "yaml_path": "BingData/predict_yaml",
+             "yaml_name": "dreamstime_com_dyhead_objvg_e39",
+             "yaml_name_no_coco": "dreamstime_com_Detection_Pretrain_NoCOCO_Packed125",
+             "is_train": True,
+         },
+         # od to grounding
+         # coco tsv
+         "coco_dt_train": {
+             "dataset_file": "coco_dt",
+             "yaml_path": "coco_tsv/coco_obj.yaml",
+             "is_train": True,
+         },
+         "COCO_odinw_train_8copy_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/COCO_odinw_train_8copy.yaml",
+             "is_train": True,
+         },
+         "COCO_odinw_val_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/COCO_odinw_val.yaml",
+             "is_train": False,
+         },
+         # lvis tsv
+         "lvisv1_dt_train": {
+             "dataset_file": "lvisv1_dt",
+             "yaml_path": "coco_tsv/LVIS_v1_train.yaml",
+             "is_train": True,
+         },
+         "LVIS_odinw_train_8copy_dt_train": {
+             "dataset_file": "coco_odinw_dt",
+             "yaml_path": "coco_tsv/LVIS_odinw_train_8copy.yaml",
+             "is_train": True,
+         },
+         # object365 tsv
+         "object365_dt_train": {
+             "dataset_file": "object365_dt",
+             "yaml_path": "Objects365/objects365_train_vgoiv6.cas2000.yaml",
+             "is_train": True,
+         },
+         "object365_odinw_2copy_dt_train": {
+             "dataset_file": "object365_odinw_dt",
+             "yaml_path": "Objects365/objects365_train_odinw.cas2000_2copy.yaml",
+             "is_train": True,
+         },
+         "objects365_odtsv_train": {
+             "dataset_file": "objects365_odtsv",
+             "yaml_path": "Objects365/train.cas2000.yaml",
+             "is_train": True,
+         },
+         "objects365_odtsv_val": {
+             "dataset_file": "objects365_odtsv",
+             "yaml_path": "Objects365/val.yaml",
+             "is_train": False,
+         },
+         # ImageNet OD
+         "imagenetod_train_odinw_2copy_dt": {
+             "dataset_file": "imagenetod_odinw_dt",
+             "yaml_path": "imagenet_od/imagenetod_train_odinw_2copy.yaml",
+             "is_train": True,
+         },
+         # OpenImage OD
+         "oi_train_odinw_dt": {
+             "dataset_file": "oi_odinw_dt",
+             "yaml_path": "openimages_v5c/oi_train_odinw.cas.2000.yaml",
+             "is_train": True,
+         },
+         # vg tsv
+         "vg_dt_train": {
+             "dataset_file": "vg_dt",
+             "yaml_path": "visualgenome/train_vgoi6_clipped.yaml",
+             "is_train": True,
+         },
+         "vg_odinw_clipped_8copy_dt_train": {
+             "dataset_file": "vg_odinw_clipped_8copy_dt",
+             "yaml_path": "visualgenome/train_odinw_clipped_8copy.yaml",
+             "is_train": True,
+         },
+         "vg_vgoi6_clipped_8copy_dt_train": {
+             "dataset_file": "vg_vgoi6_clipped_8copy_dt",
+             "yaml_path": "visualgenome/train_vgoi6_clipped_8copy.yaml",
+             "is_train": True,
+         },
+         # coco json
+         "coco_grounding_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/instances_train2017.json",
+             "is_train": True,
+         },
+         "lvis_grounding_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+         "lvis_evaluation_val": {
+             "img_dir": "lvis/coco2017",
+             "ann_file": "lvis/lvis_v1_minival_inserted_image_name.json",
+             "is_train": False,
+         },
+         "lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_val.json"},
+         # legacy detection dataset
+         "hsd_v001": {"img_dir": "hsd/20170901_Detection_HeadShoulder.V001/RawImages", "ann_file": "hsd/HSD_V001.json"},
+         "hsd_hddb": {"img_dir": "hddb/Images", "ann_file": "hddb/HDDB.json"},
+         "opencoco_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencoco_train.json"},
+         "opencoco_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencoco_val.json"},
+         "opencoco_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencoco_test.json"},
+         "openhuman_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhuman_train.json"},
+         "openhuman_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhuman_val.json"},
+         "openhuman_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhuman_test.json"},
+         "opencrowd_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencrowd_train.json"},
+         "opencrowd_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencrowd_val.json"},
+         "opencrowd_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencrowd_test.json"},
+         "opencar_train": {"img_dir": "openimages/train", "ann_file": "openimages/opencar_train.json"},
+         "opencar_val": {"img_dir": "openimages/val", "ann_file": "openimages/opencar_val.json"},
+         "opencar_test": {"img_dir": "openimages/test", "ann_file": "openimages/opencar_test.json"},
+         "openhumancar_train": {"img_dir": "openimages/train", "ann_file": "openimages/openhumancar_train.json"},
+         "openhumancar_val": {"img_dir": "openimages/val", "ann_file": "openimages/openhumancar_val.json"},
+         "openhuamncar_test": {"img_dir": "openimages/test", "ann_file": "openimages/openhumancar_test.json"},
+         "open500_train": {
+             "img_dir": "openimages/train",
+             "ann_file": "openimages/openimages_challenge_2019_train_bbox.json",
+         },
+         "open500_val": {
+             "img_dir": "openimages/val",
+             "ann_file": "openimages/openimages_challenge_2019_val_bbox.json",
+         },
+         "openproposal_test": {
+             "img_dir": "openimages/test2019",
+             "ann_file": "openimages/proposals_test.json",
+         },
+         "object365_train": {"img_dir": "object365/train", "ann_file": "object365/objects365_train.json"},
+         "object365_val": {"img_dir": "object365/val", "ann_file": "object365/objects365_val.json"},
+         "lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_train.json"},
+         "lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/lvis_od_val.json"},
+         "image200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/im200_train.json"},
+         "image200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/im200_val.json"},
+         "coco_2017_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_train2017.json"},
+         "coco_2017_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/instances_val2017.json"},
+         "coco_2017_test": {"img_dir": "coco/test2017", "ann_file": "coco/annotations/image_info_test-dev2017.json"},
+         "coco10_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/instances_minitrain2017.json"},
+         "coco_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/instances_train2014.json"},
+         "coco_2014_val": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_val2014.json"},
+         "coco_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/instances_minival2014.json"},
+         "coco_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/instances_valminusminival2014.json",
+         },
+         "coco_2014_train_partial": {
+             "img_dir": "coco/train2014",
+             "ann_file": "coco/annotations/partial0.2_train2014.json",
+         },
+         "coco_2014_valminusminival_partial": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/partial0.2_valminusminival2014.json",
+         },
+         "coco_2014_train_few100": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few100_train2014.json"},
+         "coco_2014_train_few300": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/few300_train2014.json"},
+         "coco_human_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/humans_train2014.json"},
+         "coco_human_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/humans_minival2014.json"},
+         "coco_human_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humans_valminusminival2014.json",
+         },
+         "coco_car_2014_train": {"img_dir": "coco/train2014", "ann_file": "coco/annotations/car_train2014.json"},
+         "coco_car_2014_minival": {"img_dir": "coco/val2014", "ann_file": "coco/annotations/car_minival2014.json"},
+         "coco_car_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/car_valminusminival2014.json",
+         },
+         "coco_humancar_2014_train": {
+             "img_dir": "coco/train2014",
+             "ann_file": "coco/annotations/humancar_train2014.json",
+         },
+         "coco_humancar_2014_minival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humancar_minival2014.json",
+         },
+         "coco_humancar_2014_valminusminival": {
+             "img_dir": "coco/val2014",
+             "ann_file": "coco/annotations/humancar_valminusminival2014.json",
+         },
+         "coco_keypoint_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/person_keypoints_train2017.json",
+         },
+         "coco_keypoint_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/person_keypoints_val2017.json",
+         },
+         "coco_headshoulder_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/headshoulder_train2017.json",
+         },
+         "coco_headshoulder_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/headshoulder_val2017.json",
+         },
+         "coco_hskeypoint_2017_train": {
+             "img_dir": "coco/train2017",
+             "ann_file": "coco/annotations/person_hskeypoints_train2017.json",
+         },
+         "coco_hskeypoint_2017_val": {
+             "img_dir": "coco/val2017",
+             "ann_file": "coco/annotations/person_hskeypoints_val2017.json",
+         },
+         "voc_2007_train": {"data_dir": "voc/VOC2007", "split": "train"},
+         "voc_2007_train_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_train2007.json",
+         },
+         "voc_2007_val": {"data_dir": "voc/VOC2007", "split": "val"},
+         "voc_2007_val_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_val2007.json",
+         },
+         "voc_2007_test": {"data_dir": "voc/VOC2007", "split": "test"},
+         "voc_2007_test_cocostyle": {
+             "img_dir": "voc/VOC2007/JPEGImages",
+             "ann_file": "voc/VOC2007/Annotations/pascal_test2007.json",
+         },
+         "voc_2012_train": {"data_dir": "voc/VOC2012", "split": "train"},
+         "voc_2012_train_cocostyle": {
+             "img_dir": "voc/VOC2012/JPEGImages",
+             "ann_file": "voc/VOC2012/Annotations/pascal_train2012.json",
+         },
+         "voc_2012_val": {"data_dir": "voc/VOC2012", "split": "val"},
+         "voc_2012_val_cocostyle": {
+             "img_dir": "voc/VOC2012/JPEGImages",
+             "ann_file": "voc/VOC2012/Annotations/pascal_val2012.json",
+         },
+         "voc_2012_test": {
+             "data_dir": "voc/VOC2012",
+             "split": "test"
+             # PASCAL VOC2012 didn't make the test annotations available, so there's no json annotation
+         },
+         "cityscapes_fine_instanceonly_seg_train_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_train.json",
+         },
+         "cityscapes_fine_instanceonly_seg_val_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_val.json",
+         },
+         "cityscapes_fine_instanceonly_seg_test_cocostyle": {
+             "img_dir": "cityscapes/images",
+             "ann_file": "cityscapes/annotations/instancesonly_filtered_gtFine_test.json",
+         },
+         "crowdhuman_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_train.json"},
+         "crowdhuman_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhuman_val.json"},
+         "crowdhead_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_train.json"},
+         "crowdhead_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdhead_val.json"},
+         "crowdfull_train": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_train.json"},
+         "crowdfull_val": {"img_dir": "CrowdHuman/Images", "ann_file": "CrowdHuman/crowdfull_val.json"},
+         "ternium_train": {"img_dir": "ternium/images", "ann_file": "ternium/train_annotation.json"},
+         "ternium_val": {"img_dir": "ternium/images", "ann_file": "ternium/val_annotation.json"},
+         "ternium_test": {"img_dir": "ternium/images", "ann_file": "ternium/test_annotation.json"},
+         "ternium_test_crop": {"img_dir": "ternium/test_motion_crop", "ann_file": "ternium/test_motion_crop.json"},
+         "ternium_train_aug": {"img_dir": "ternium/train_crop_aug", "ann_file": "ternium/train_crop_aug.json"},
+         "ternium_test_aug": {"img_dir": "ternium/test_crop_aug", "ann_file": "ternium/test_motion_crop_aug.json"},
+         "ternium_vh_train": {
+             "img_dir": "ternium-vehicle/train_dataset_coco/images",
+             "ann_file": "ternium-vehicle/train_dataset_coco/coco_annotation.json",
+         },
+         "ternium_vh_val": {
+             "img_dir": "ternium-vehicle/validation_dataset_coco/images",
+             "ann_file": "ternium-vehicle/validation_dataset_coco/coco_annotation.json",
+         },
+         "msra_traffic": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/annotation.json"},
+         "msra_traffic_car": {"img_dir": "msra-traffic/Images", "ann_file": "msra-traffic/car_annotation.json"},
+         "msra_traffic_humancar": {
+             "img_dir": "msra-traffic/Images",
+             "ann_file": "msra-traffic/humancar_annotation.json",
+         },
+         "jigsaw_car_train": {"img_dir": "jigsaw", "ann_file": "jigsaw/train.json"},
+         "jigsaw_car_val": {"img_dir": "jigsaw", "ann_file": "jigsaw/val.json"},
+         "miotcd_train": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/train.json"},
+         "miotcd_val": {"img_dir": "MIO-TCD/MIO-TCD-Localization", "ann_file": "MIO-TCD/val.json"},
+         "detrac_train": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/train.json"},
+         "detrac_val": {"img_dir": "detrac/Insight-MVT_Annotation_Train", "ann_file": "detrac/val.json"},
+         "mrw": {"img_dir": "mrw/clips", "ann_file": "mrw/annotations.json"},
+         "mrw_bg": {"img_dir": "mrw/bg", "ann_file": "mrw/bg_annotations.json"},
+         "webmarket_bg": {"img_dir": "webmarket", "ann_file": "webmarket/bg_annotations.json"},
+         "mot17_train": {"img_dir": "mot/MOT17Det", "ann_file": "mot/MOT17Det/train.json"},
+         "egohands": {"img_dir": "egohands/images", "ann_file": "egohands/egohands.json"},
+         "hof": {"img_dir": "hof/images_original_size", "ann_file": "hof/train.json"},
+         "vlmhof": {"img_dir": "vlmhof/RGB", "ann_file": "vlmhof/train.json"},
+         "vgghands_train": {"img_dir": "vgghands/training_dataset", "ann_file": "vgghands/training.json"},
+         "vgghands_val": {"img_dir": "vgghands/validation_dataset", "ann_file": "vgghands/validation.json"},
+         "vgghands_test": {"img_dir": "vgghands/test_dataset", "ann_file": "vgghands/test.json"},
+         "od:coco_train": {"img_dir": "coco/train2017", "ann_file": "coco/annotations/od_train2017.json"},
+         "od:coco_val": {"img_dir": "coco/val2017", "ann_file": "coco/annotations/od_val2017.json"},
+         "od:lvis_train": {"img_dir": "coco", "ann_file": "coco/annotations/od_train-lvis.json"},
+         "od:lvis_val": {"img_dir": "coco", "ann_file": "coco/annotations/od_val-lvis.json"},
+         "od:o365_train": {"img_dir": "object365/train", "ann_file": "object365/od_train.json"},
+         "od:o365_val": {"img_dir": "object365/val", "ann_file": "object365/od_val.json"},
+         "od:oi500_train": {
+             "img_dir": "openimages/train",
+             "ann_file": "openimages/od_train2019.json",
+             "paste_dir": "openimages/panoptic_train_challenge_2019",
+             "paste_file": "openimages/panoptic_train2019.json",
+         },
+         "od:oi500_val": {
+             "img_dir": "openimages/val",
+             "ann_file": "openimages/od_val2019.json",
+             "paste_dir": "openimages/panoptic_val_challenge_2019",
+             "paste_file": "openimages/panoptic_val2019.json",
+         },
+         "od:im200_train": {"img_dir": "imagenet-od/Data/DET/train", "ann_file": "imagenet-od/train.json"},
+         "od:im200_val": {"img_dir": "imagenet-od/Data/DET/val", "ann_file": "imagenet-od/val.json"},
+         "cv:animal661_train": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/train.json"},
+         "cv:animal661_test": {"img_dir": "cvtasks/animal-661/images", "ann_file": "cvtasks/animal-661/test.json"},
+         "cv:seeingai_train": {"img_dir": "cvtasks/SeeingAI/train.tsv", "ann_file": "cvtasks/SeeingAI/train.json"},
+         "cv:seeingai_test": {"img_dir": "cvtasks/SeeingAI/test.tsv", "ann_file": "cvtasks/SeeingAI/test.json"},
+         "cv:office_train": {
+             "img_dir": "cvtasks/Ping-Office-Env/train.tsv",
+             "ann_file": "cvtasks/Ping-Office-Env/train.json",
+         },
+         "cv:office_test": {
+             "img_dir": "cvtasks/Ping-Office-Env/test.tsv",
+             "ann_file": "cvtasks/Ping-Office-Env/test.json",
+         },
+         "cv:logo_train": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/train.json"},
+         "cv:logo_test": {"img_dir": "cvtasks/Ping-Logo", "ann_file": "cvtasks/Ping-Logo/test.json"},
+         "cv:nba_train": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/train.json"},
+         "cv:nba_test": {"img_dir": "cvtasks/Ping-NBA", "ann_file": "cvtasks/Ping-NBA/test.json"},
+         "cv:traffic_train": {"img_dir": "cvtasks/TrafficData/train.tsv", "ann_file": "cvtasks/TrafficData/train.json"},
+         "cv:traffic_test": {"img_dir": "cvtasks/TrafficData/test.tsv", "ann_file": "cvtasks/TrafficData/test.json"},
+         "cv:fashion5k_train": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/train.json"},
+         "cv:fashion5k_test": {"img_dir": "cvtasks/fashion5k", "ann_file": "cvtasks/fashion5k/test.json"},
+         "cv:malaria_train": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/train.json"},
+         "cv:malaria_test": {"img_dir": "cvtasks/malaria", "ann_file": "cvtasks/malaria/test.json"},
+         "cv:product_train": {
+             "img_dir": "cvtasks/product_detection",
+             "ann_file": "cvtasks/product_detection/train.json",
+         },
+         "cv:product_test": {"img_dir": "cvtasks/product_detection", "ann_file": "cvtasks/product_detection/test.json"},
+         "vl:vg_train": {"yaml_file": "vlp/visualgenome/train_vgoi6_clipped.yaml"},
+         "vl:vg_test": {"yaml_file": "vlp/visualgenome/test_vgoi6_clipped.yaml"},
+         "imagenet_train": {"img_dir": "imagenet-tsv/train.tsv", "ann_file": None},
+         "imagenet_val": {"img_dir": "imagenet-tsv/val.tsv", "ann_file": None},
+         "paco_lvis_v1_train_grounding": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_train.json",
+         },
+         "paco_lvis_v1_val": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_val.json",
+         },
+         "paco_lvis_v1_test": {
+             "img_dir": "coco",
+             "ann_file": "paco/paco_lvis_v1_test.json",
+         },
+         "omnilabel_val": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3.json"},
+         "omnilabel_val_coco": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_coco.json"},
+         "omnilabel_val_o365": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_object365.json"},
+         "omnilabel_val_oi_v5": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_val_v0.1.3_openimagesv5.json"},
+         "omnilabel_test": {"img_dir": "omnilabel/", "ann_file": "omnilabel/dataset_all_test_v0.1.3.json"},
+     }
+
+     @staticmethod
+     def set(name, info):
+         DatasetCatalog.DATASETS.update({name: info})
+
+     @staticmethod
+     def get(name):
+
+         if name.endswith("_bg"):
+             attrs = DatasetCatalog.DATASETS[name]
+             data_dir = try_to_find(attrs["ann_file"], return_dir=True)
+             args = dict(
+                 root=os.path.join(data_dir, attrs["img_dir"]),
+                 ann_file=os.path.join(data_dir, attrs["ann_file"]),
+             )
+             return dict(
+                 factory="Background",
+                 args=args,
+             )
+         else:
+             if "bing" in name.split("_"):
+                 attrs = DatasetCatalog.DATASETS["bing_caption_train"]
+             else:
+                 attrs = DatasetCatalog.DATASETS[name]
+             # if "yaml_file" in attrs:
+             #     yaml_file = try_to_find(attrs["yaml_file"], return_dir=False)
+             #     args = dict(yaml_file=yaml_file)
+             #     return dict(
+             #         factory="VGTSVDataset",
+             #         args=args,
+             #     )
+             # elif attrs["img_dir"].endswith('tsv'):
+             #     try:
+             #         data_dir = try_to_find(attrs["img_dir"], return_dir=True)
+             #         if attrs["ann_file"] is None:
+             #             map_file = None
+             #         elif attrs["ann_file"].startswith("./"):
+             #             map_file = attrs["ann_file"]
+             #         else:
+             #             map_file = os.path.join(data_dir, attrs["ann_file"])
+             #     except:
+             #         return None
+             #     args = dict(
+             #         tsv_file=os.path.join(data_dir, attrs["img_dir"]),
+             #         anno_file=map_file,
+             #     )
+             #     return dict(
+             #         factory="TSVDataset",
+             #         args=args,
+             #     )
+             if "voc" in name and "split" in attrs:
+                 data_dir = try_to_find(attrs["data_dir"], return_dir=True)
+                 args = dict(
+                     data_dir=os.path.join(data_dir, attrs["data_dir"]),
+                     split=attrs["split"],
+                 )
+                 return dict(
+                     factory="PascalVOCDataset",
+                     args=args,
+                 )
+             elif "omnilabel" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="OmniLabelDataset",
+                     args=args,
+                 )
+             elif "mixed" in name:
+                 vg_img_dir = try_to_find(attrs["vg_img_dir"], return_dir=True)
+                 coco_img_dir = try_to_find(attrs["coco_img_dir"], return_dir=True)
+                 ann_file = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder_coco=os.path.join(coco_img_dir, attrs["coco_img_dir"]),
+                     img_folder_vg=os.path.join(vg_img_dir, attrs["vg_img_dir"]),
+                     ann_file=os.path.join(ann_file, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="MixedDataset",
+                     args=args,
+                 )
+             elif "flickr" in name:
+                 img_dir = try_to_find(attrs["img_folder"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_folder"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                     is_train=attrs["is_train"],
+                 )
+                 return dict(
+                     factory="FlickrDataset",
+                     args=args,
+                 )
+             elif "refexp" in name or "refcoco" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="RefExpDataset",
+                     args=args,
+                 )
+             elif "gqa" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="GQADataset",
+                     args=args,
+                 )
+             elif "phrasecut" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="PhrasecutDetection",
+                     args=args,
+                 )
+             elif "_caption" in name:
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 if "no_coco" in name:
+                     yaml_name = attrs["yaml_name_no_coco"]
+                 else:
+                     yaml_name = attrs["yaml_name"]
+                 yaml_file_name = "{}.{}.yaml".format(yaml_name, name.split("_")[2])
+                 args = dict(yaml_file=os.path.join(yaml_path, attrs["yaml_path"], yaml_file_name))
+                 return dict(
+                     factory="CaptionTSV",
+                     args=args,
+                 )
+             elif "inferencecap" in name:
+                 yaml_file_name = try_to_find(attrs["yaml_path"])
+                 args = dict(yaml_file=yaml_file_name)
+                 return dict(
+                     factory="CaptionTSV",
+                     args=args,
+                 )
+             elif "pseudo_data" in name:
+                 args = dict(yaml_file=try_to_find(attrs["yaml_path"]))
+                 return dict(
+                     factory="PseudoData",
+                     args=args,
+                 )
+             elif "_dt" in name:
+                 dataset_file = attrs["dataset_file"]
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 args = dict(
+                     name=dataset_file,
+                     yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+                 )
+                 return dict(
+                     factory="CocoDetectionTSV",
+                     args=args,
+                 )
+             elif "_odtsv" in name:
+                 dataset_file = attrs["dataset_file"]
+                 yaml_path = try_to_find(attrs["yaml_path"], return_dir=True)
+                 args = dict(
+                     name=dataset_file,
+                     yaml_file=os.path.join(yaml_path, attrs["yaml_path"]),
+                 )
+                 return dict(
+                     factory="ODTSVDataset",
+                     args=args,
+                 )
+             elif "_grounding" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="CocoGrounding",
+                     args=args,
+                 )
+             elif "lvis_evaluation" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="LvisDetection",
+                     args=args,
+                 )
+             elif "paco" in name:
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 args = dict(
+                     img_folder=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 return dict(
+                     factory="PacoDetection",
+                     args=args,
+                 )
+             else:
+                 ann_dir = try_to_find(attrs["ann_file"], return_dir=True)
+                 img_dir = try_to_find(attrs["img_dir"], return_dir=True)
+                 args = dict(
+                     root=os.path.join(img_dir, attrs["img_dir"]),
+                     ann_file=os.path.join(ann_dir, attrs["ann_file"]),
+                 )
+                 for k, v in attrs.items():
+                     args.update({k: os.path.join(ann_dir, v)})
+                 return dict(
+                     factory="COCODataset",
+                     args=args,
+                 )
+
+         raise RuntimeError("Dataset not available: {}".format(name))
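For orientation, resolving a catalog name yields a factory name plus constructor kwargs; for example (the path prefix depends on where try_to_find locates the data, so it is illustrative):

info = DatasetCatalog.get("flickr30k_val")
# info == {
#     "factory": "FlickrDataset",
#     "args": {
#         "img_folder": "DATASET/flickr30k/flickr30k_images/val",
#         "ann_file": "DATASET/mdetr_annotations/final_flickr_separateGT_val.json",
#         "is_train": False,
#     },
# }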
+
+
+ class ModelCatalog(object):
+     S3_C2_DETECTRON_URL = "https://dl.fbaipublicfiles.com/detectron"
+     C2_IMAGENET_MODELS = {
+         "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl",
+         "MSRA/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl",
+         "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl",
+         "MSRA/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl",
+         "FAIR/20171220/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl",
+         "FAIR/20171220/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl",
+     }
+
+     C2_DETECTRON_SUFFIX = "output/train/coco_2014_train%3Acoco_2014_valminusminival/generalized_rcnn/model_final.pkl"
+     C2_DETECTRON_MODELS = {
+         "35857197/e2e_faster_rcnn_R-50-C4_1x": "01_33_49.iAX0mXvW",
+         "35857345/e2e_faster_rcnn_R-50-FPN_1x": "01_36_30.cUF7QR7I",
+         "35857890/e2e_faster_rcnn_R-101-FPN_1x": "01_38_50.sNxI7sX7",
+         "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "06_31_39.5MIHi1fZ",
+         "35858791/e2e_mask_rcnn_R-50-C4_1x": "01_45_57.ZgkA7hPB",
+         "35858933/e2e_mask_rcnn_R-50-FPN_1x": "01_48_14.DzEQe4wC",
+         "35861795/e2e_mask_rcnn_R-101-FPN_1x": "02_31_37.KqyEK4tT",
+         "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "06_35_59.RZotkLKI",
+     }
+
+     @staticmethod
+     def get(name):
+         if name.startswith("Caffe2Detectron/COCO"):
+             return ModelCatalog.get_c2_detectron_12_2017_baselines(name)
+         if name.startswith("ImageNetPretrained"):
+             return ModelCatalog.get_c2_imagenet_pretrained(name)
+         raise RuntimeError("model not present in the catalog {}".format(name))
+
+     @staticmethod
+     def get_c2_imagenet_pretrained(name):
+         prefix = ModelCatalog.S3_C2_DETECTRON_URL
+         name = name[len("ImageNetPretrained/"):]
+         name = ModelCatalog.C2_IMAGENET_MODELS[name]
+         url = "/".join([prefix, name])
+         return url
+
+     @staticmethod
+     def get_c2_detectron_12_2017_baselines(name):
+         # Detectron C2 models are stored following the structure
+         #   prefix/<model_id>/2012_2017_baselines/<model_name>.yaml.<signature>/suffix
+         # we use as identifiers in the catalog Caffe2Detectron/COCO/<model_id>/<model_name>
+         prefix = ModelCatalog.S3_C2_DETECTRON_URL
+         suffix = ModelCatalog.C2_DETECTRON_SUFFIX
+         # remove identification prefix
+         name = name[len("Caffe2Detectron/COCO/"):]
+         # split in <model_id> and <model_name>
+         model_id, model_name = name.split("/")
+         # parsing to make it match the url address from the Caffe2 models
+         model_name = "{}.yaml".format(model_name)
+         signature = ModelCatalog.C2_DETECTRON_MODELS[name]
+         unique_name = ".".join([model_name, signature])
+         url = "/".join([prefix, model_id, "12_2017_baselines", unique_name, suffix])
+         return url
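Usage follows directly from the code above; for example:

url = ModelCatalog.get("ImageNetPretrained/MSRA/R-50")
# -> "https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/MSRA/R-50.pkl"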
maskrcnn_benchmark/csrc/ROIAlign.h ADDED
@@ -0,0 +1,46 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+ // Interface for Python
+ at::Tensor ROIAlign_forward(const at::Tensor& input,
+                             const at::Tensor& rois,
+                             const float spatial_scale,
+                             const int pooled_height,
+                             const int pooled_width,
+                             const int sampling_ratio) {
+   if (input.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
+ }
+
+ at::Tensor ROIAlign_backward(const at::Tensor& grad,
+                              const at::Tensor& rois,
+                              const float spatial_scale,
+                              const int pooled_height,
+                              const int pooled_width,
+                              const int batch_size,
+                              const int channels,
+                              const int height,
+                              const int width,
+                              const int sampling_ratio) {
+   if (grad.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
maskrcnn_benchmark/csrc/ROIPool.h ADDED
@@ -0,0 +1,48 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
+                                                    const at::Tensor& rois,
+                                                    const float spatial_scale,
+                                                    const int pooled_height,
+                                                    const int pooled_width) {
+   if (input.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+ at::Tensor ROIPool_backward(const at::Tensor& grad,
+                             const at::Tensor& input,
+                             const at::Tensor& rois,
+                             const at::Tensor& argmax,
+                             const float spatial_scale,
+                             const int pooled_height,
+                             const int pooled_width,
+                             const int batch_size,
+                             const int channels,
+                             const int height,
+                             const int width) {
+   if (grad.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+
maskrcnn_benchmark/csrc/SigmoidFocalLoss.h ADDED
@@ -0,0 +1,41 @@
+ #pragma once
+
+ #include "cpu/vision.h"
+
+ #ifdef WITH_CUDA
+ #include "cuda/vision.h"
+ #endif
+
+ // Interface for Python
+ at::Tensor SigmoidFocalLoss_forward(
+     const at::Tensor& logits,
+     const at::Tensor& targets,
+     const int num_classes,
+     const float gamma,
+     const float alpha) {
+   if (logits.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, alpha);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
+
+ at::Tensor SigmoidFocalLoss_backward(
+     const at::Tensor& logits,
+     const at::Tensor& targets,
+     const at::Tensor& d_losses,
+     const int num_classes,
+     const float gamma,
+     const float alpha) {
+   if (logits.device().is_cuda()) {
+ #ifdef WITH_CUDA
+     return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, num_classes, gamma, alpha);
+ #else
+     AT_ERROR("Not compiled with GPU support");
+ #endif
+   }
+   AT_ERROR("Not implemented on the CPU");
+ }
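This header only dispatches to the kernels. For reference, a minimal NumPy sketch of the sigmoid focal loss they compute, in the standard Lin et al. (2017) formulation; the kernel's exact masking and indexing may differ, and the class-id convention (0 = background) is an assumption:

import numpy as np

def sigmoid_focal_loss(logits, targets, num_classes, gamma, alpha):
    # logits: (N, num_classes); targets: (N,) class ids, 0 assumed background.
    p = np.clip(1.0 / (1.0 + np.exp(-logits)), 1e-7, 1 - 1e-7)
    cls = np.arange(1, num_classes + 1)        # one column per foreground class
    pos = targets[:, None] == cls[None, :]     # (N, num_classes) positive mask
    # alpha-weighted focal terms for positives and negatives
    return np.where(
        pos,
        -alpha * (1.0 - p) ** gamma * np.log(p),
        -(1.0 - alpha) * p ** gamma * np.log(1.0 - p),
    )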
maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp ADDED
@@ -0,0 +1,257 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2
+ #include "cpu/vision.h"
3
+
4
+ // implementation taken from Caffe2
5
+ template <typename T>
6
+ struct PreCalc {
7
+ int pos1;
8
+ int pos2;
9
+ int pos3;
10
+ int pos4;
11
+ T w1;
12
+ T w2;
13
+ T w3;
14
+ T w4;
15
+ };
16
+
17
+ template <typename T>
18
+ void pre_calc_for_bilinear_interpolate(
19
+ const int height,
20
+ const int width,
21
+ const int pooled_height,
22
+ const int pooled_width,
23
+ const int iy_upper,
24
+ const int ix_upper,
25
+ T roi_start_h,
26
+ T roi_start_w,
27
+ T bin_size_h,
28
+ T bin_size_w,
29
+ int roi_bin_grid_h,
30
+ int roi_bin_grid_w,
31
+ std::vector<PreCalc<T>>& pre_calc) {
32
+ int pre_calc_index = 0;
33
+ for (int ph = 0; ph < pooled_height; ph++) {
34
+ for (int pw = 0; pw < pooled_width; pw++) {
35
+ for (int iy = 0; iy < iy_upper; iy++) {
36
+ const T yy = roi_start_h + ph * bin_size_h +
37
+ static_cast<T>(iy + .5f) * bin_size_h /
38
+ static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < ix_upper; ix++) {
+ const T xx = roi_start_w + pw * bin_size_w +
+ static_cast<T>(ix + .5f) * bin_size_w /
+ static_cast<T>(roi_bin_grid_w);
+
+ T x = xx;
+ T y = yy;
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ PreCalc<T> pc;
+ pc.pos1 = 0;
+ pc.pos2 = 0;
+ pc.pos3 = 0;
+ pc.pos4 = 0;
+ pc.w1 = 0;
+ pc.w2 = 0;
+ pc.w3 = 0;
+ pc.w4 = 0;
+ pre_calc[pre_calc_index] = pc;
+ pre_calc_index += 1;
+ continue;
+ }
+
+ if (y <= 0) {
+ y = 0;
+ }
+ if (x <= 0) {
+ x = 0;
+ }
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ // save weights and indices
+ PreCalc<T> pc;
+ pc.pos1 = y_low * width + x_low;
+ pc.pos2 = y_low * width + x_high;
+ pc.pos3 = y_high * width + x_low;
+ pc.pos4 = y_high * width + x_high;
+ pc.w1 = w1;
+ pc.w2 = w2;
+ pc.w3 = w3;
+ pc.w4 = w4;
+ pre_calc[pre_calc_index] = pc;
+
+ pre_calc_index += 1;
+ }
+ }
+ }
+ }
+ }
+
+ template <typename T>
+ void ROIAlignForward_cpu_kernel(
+ const int nthreads,
+ const T* bottom_data,
+ const T& spatial_scale,
+ const int channels,
+ const int height,
+ const int width,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois,
+ //int roi_cols,
+ T* top_data) {
+ //AT_ASSERT(roi_cols == 4 || roi_cols == 5);
+ int roi_cols = 5;
+
+ int n_rois = nthreads / channels / pooled_width / pooled_height;
+ // (n, c, ph, pw) is an element in the pooled output
+ // can be parallelized using omp
+ // #pragma omp parallel for num_threads(32)
+ for (int n = 0; n < n_rois; n++) {
+ int index_n = n * channels * pooled_width * pooled_height;
+
+ // roi could have 4 or 5 columns
+ const T* offset_bottom_rois = bottom_rois + n * roi_cols;
+ int roi_batch_ind = 0;
+ if (roi_cols == 5) {
+ roi_batch_ind = offset_bottom_rois[0];
+ offset_bottom_rois++;
+ }
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[0] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[1] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = std::max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = std::max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0)
+ ? sampling_ratio
+ : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w =
+ (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ // we want to precalculate indices and weights shared by all channels;
+ // this is the key point of the optimization
+ std::vector<PreCalc<T>> pre_calc(
+ roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
+ pre_calc_for_bilinear_interpolate(
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ roi_start_h,
+ roi_start_w,
+ bin_size_h,
+ bin_size_w,
+ roi_bin_grid_h,
+ roi_bin_grid_w,
+ pre_calc);
+
+ for (int c = 0; c < channels; c++) {
+ int index_n_c = index_n + c * pooled_width * pooled_height;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ int pre_calc_index = 0;
+
+ for (int ph = 0; ph < pooled_height; ph++) {
+ for (int pw = 0; pw < pooled_width; pw++) {
+ int index = index_n_c + ph * pooled_width + pw;
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+ for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+ PreCalc<T> pc = pre_calc[pre_calc_index];
+ output_val += pc.w1 * offset_bottom_data[pc.pos1] +
+ pc.w2 * offset_bottom_data[pc.pos2] +
+ pc.w3 * offset_bottom_data[pc.pos3] +
+ pc.w4 * offset_bottom_data[pc.pos4];
+
+ pre_calc_index += 1;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ } // for pw
+ } // for ph
+ } // for c
+ } // for n
+ }
+
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(!input.device().is_cuda(), "input must be a CPU tensor");
+ AT_ASSERTM(!rois.device().is_cuda(), "rois must be a CPU tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+
+ if (output.numel() == 0) {
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+ ROIAlignForward_cpu_kernel<scalar_t>(
+ output_size,
+ input.data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois.data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ return output;
+ }
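Note: a minimal smoke test for the CPU entry point above (illustrative only, not part of this commit; assumes the declaration from cpu/vision.h is visible). ROIs use the 5-column layout (batch_index, x1, y1, x2, y2) consumed by the kernel.

#include <torch/torch.h>

at::Tensor roi_align_cpu_smoke_test() {
  // One 1x3x16x16 feature map and a single ROI covering its top-left corner.
  auto input = torch::rand({1, 3, 16, 16});
  auto rois = torch::tensor({0.f, 0.f, 0.f, 8.f, 8.f}, torch::kFloat).view({1, 5});
  // spatial_scale = 1.0, 7x7 output bins, 2 samples per bin axis.
  return ROIAlign_forward_cpu(input, rois, 1.0f, 7, 7, 2);  // shape (1, 3, 7, 7)
}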
maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp ADDED
@@ -0,0 +1,75 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include "cpu/vision.h"
+
+
+ template <typename scalar_t>
+ at::Tensor nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+
+ auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
+
+ auto ndets = dets.size(0);
+ at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
+
+ auto suppressed = suppressed_t.data_ptr<uint8_t>();
+ auto order = order_t.data_ptr<int64_t>();
+ auto x1 = x1_t.data_ptr<scalar_t>();
+ auto y1 = y1_t.data_ptr<scalar_t>();
+ auto x2 = x2_t.data_ptr<scalar_t>();
+ auto y2 = y2_t.data_ptr<scalar_t>();
+ auto areas = areas_t.data_ptr<scalar_t>();
+
+ for (int64_t _i = 0; _i < ndets; _i++) {
+ auto i = order[_i];
+ if (suppressed[i] == 1)
+ continue;
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto iarea = areas[i];
+
+ for (int64_t _j = _i + 1; _j < ndets; _j++) {
+ auto j = order[_j];
+ if (suppressed[j] == 1)
+ continue;
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+ if (ovr >= threshold)
+ suppressed[j] = 1;
+ }
+ }
+ return at::nonzero(suppressed_t == 0).squeeze(1);
+ }
+
+ at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold) {
+ at::Tensor result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
+ result = nms_cpu_kernel<scalar_t>(dets, scores, threshold);
+ });
+ return result;
+ }
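For reference, a hypothetical call to nms_cpu (assuming the declarations above are in scope); boxes use the (x1, y1, x2, y2) corner format with the +1 area convention seen in the kernel.

#include <torch/torch.h>

void nms_cpu_example() {
  // Two heavily overlapping boxes plus one disjoint box.
  auto dets = torch::tensor({10.f, 10.f, 50.f, 50.f,
                             12.f, 12.f, 52.f, 52.f,
                             100.f, 100.f, 140.f, 140.f}, torch::kFloat).view({3, 4});
  auto scores = torch::tensor({0.9f, 0.8f, 0.7f}, torch::kFloat);
  // With IoU threshold 0.5, box 1 (IoU ~0.83 with box 0) is suppressed.
  auto keep = nms_cpu(dets, scores, 0.5f);  // -> indices {0, 2}
}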
maskrcnn_benchmark/csrc/cpu/soft_nms.cpp ADDED
@@ -0,0 +1,117 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include "cpu/vision.h"
+
+
+ template <typename scalar_t>
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu_kernel(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
+ AT_ASSERTM(!scores.device().is_cuda(), "scores must be a CPU tensor");
+ AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores");
+
+ if (dets.numel() == 0) {
+ return std::make_pair(at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)),
+ at::empty({0}, scores.options().dtype(at::kFloat).device(at::kCPU)));
+ }
+
+ auto x1_t = dets.select(1, 0).contiguous();
+ auto y1_t = dets.select(1, 1).contiguous();
+ auto x2_t = dets.select(1, 2).contiguous();
+ auto y2_t = dets.select(1, 3).contiguous();
+
+ auto scores_t = scores.clone();
+
+ at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
+ auto ndets = dets.size(0);
+ auto inds_t = at::arange(ndets, dets.options().dtype(at::kLong).device(at::kCPU));
+
+ auto x1 = x1_t.data_ptr<scalar_t>();
+ auto y1 = y1_t.data_ptr<scalar_t>();
+ auto x2 = x2_t.data_ptr<scalar_t>();
+ auto y2 = y2_t.data_ptr<scalar_t>();
+ auto s = scores_t.data_ptr<scalar_t>();
+ auto inds = inds_t.data_ptr<int64_t>();
+ auto areas = areas_t.data_ptr<scalar_t>();
+
+ for (int64_t i = 0; i < ndets; i++) {
+
+ auto ix1 = x1[i];
+ auto iy1 = y1[i];
+ auto ix2 = x2[i];
+ auto iy2 = y2[i];
+ auto is = s[i];
+ auto ii = inds[i];
+ auto iarea = areas[i];
+
+ auto maxpos = scores_t.slice(0, i, ndets).argmax().item<int64_t>() + i;
+
+ // add max box as a detection
+ x1[i] = x1[maxpos];
+ y1[i] = y1[maxpos];
+ x2[i] = x2[maxpos];
+ y2[i] = y2[maxpos];
+ s[i] = s[maxpos];
+ inds[i] = inds[maxpos];
+ areas[i] = areas[maxpos];
+
+ // swap ith box with position of max box
+ x1[maxpos] = ix1;
+ y1[maxpos] = iy1;
+ x2[maxpos] = ix2;
+ y2[maxpos] = iy2;
+ s[maxpos] = is;
+ inds[maxpos] = ii;
+ areas[maxpos] = iarea;
+
+ ix1 = x1[i];
+ iy1 = y1[i];
+ ix2 = x2[i];
+ iy2 = y2[i];
+ iarea = areas[i];
+
+ // NMS iterations; note that ndets changes if detection boxes
+ // fall below threshold
+ for (int64_t j = i + 1; j < ndets; j++) {
+ auto xx1 = std::max(ix1, x1[j]);
+ auto yy1 = std::max(iy1, y1[j]);
+ auto xx2 = std::min(ix2, x2[j]);
+ auto yy2 = std::min(iy2, y2[j]);
+
+ auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
+ auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
+
+ auto inter = w * h;
+ auto ovr = inter / (iarea + areas[j] - inter);
+
+ s[j] = s[j] * std::exp(- std::pow(ovr, 2.0) / sigma);
+
+ // if the box score falls below threshold, discard the box by
+ // swapping it with the last box and updating ndets
+ if (s[j] < threshold) {
+ x1[j] = x1[ndets - 1];
+ y1[j] = y1[ndets - 1];
+ x2[j] = x2[ndets - 1];
+ y2[j] = y2[ndets - 1];
+ s[j] = s[ndets - 1];
+ inds[j] = inds[ndets - 1];
+ areas[j] = areas[ndets - 1];
+ j--;
+ ndets--;
+ }
+ }
+ }
+ return std::make_pair(inds_t.slice(0, 0, ndets), scores_t.slice(0, 0, ndets));
+ }
+
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma) {
+ std::pair<at::Tensor, at::Tensor> result;
+ AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
+ result = soft_nms_cpu_kernel<scalar_t>(dets, scores, threshold, sigma);
+ });
+ return result;
+ }
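The score update in the inner loop is the Gaussian penalty from Soft-NMS (Bodla et al., 2017): instead of deleting overlapping boxes, each box's score decays with its squared IoU against the current maximum. Isolated as a scalar sketch (illustrative only):

#include <cmath>

float soft_nms_decay(float score, float iou, float sigma) {
  return score * std::exp(-(iou * iou) / sigma);  // s_j *= exp(-IoU^2 / sigma)
}
// e.g. soft_nms_decay(0.8f, 0.83f, 0.5f) ~= 0.20; boxes whose decayed score
// drops below `threshold` are swapped to the tail and ndets shrinks.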
maskrcnn_benchmark/csrc/cpu/vision.h ADDED
@@ -0,0 +1,22 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #pragma once
+ #include <torch/extension.h>
+
+
+ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio);
+
+
+ at::Tensor nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold);
+
+
+ std::pair<at::Tensor, at::Tensor> soft_nms_cpu(const at::Tensor& dets,
+ const at::Tensor& scores,
+ const float threshold,
+ const float sigma);
maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu ADDED
@@ -0,0 +1,346 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __device__ T bilinear_interpolate(const T* bottom_data,
+ const int height, const int width,
+ T y, T x,
+ const int index /* index for debug only*/) {
+
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ return 0;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int) y;
+ int x_low = (int) x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = bottom_data[y_low * width + x_low];
+ T v2 = bottom_data[y_low * width + x_high];
+ T v3 = bottom_data[y_high * width + x_low];
+ T v4 = bottom_data[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+ }
+
+ template <typename T>
+ __global__ void RoIAlignForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels,
+ const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ const T* bottom_rois, T* top_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width;
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ T output_val = 0.;
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index);
+ output_val += val;
+ }
+ }
+ output_val /= count;
+
+ top_data[index] = output_val;
+ }
+ }
+
+
+ template <typename T>
+ __device__ void bilinear_interpolate_gradient(
+ const int height, const int width,
+ T y, T x,
+ T & w1, T & w2, T & w3, T & w4,
+ int & x_low, int & x_high, int & y_low, int & y_high,
+ const int index /* index for debug only*/) {
+
+ // deal with cases where inverse elements fall outside the feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int) y;
+ x_low = (int) x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T) y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T) x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = bottom_data[y_low * width + x_low];
+ // T v2 = bottom_data[y_low * width + x_high];
+ // T v3 = bottom_data[y_high * width + x_low];
+ // T v4 = bottom_data[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+ }
+
+ template <typename T>
+ __global__ void RoIAlignBackwardFeature(const int nthreads, const T* top_diff,
+ const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width,
+ const int sampling_ratio,
+ T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+
+ // Do not use rounding; this implementation detail is critical
+ T roi_start_w = offset_bottom_rois[1] * spatial_scale;
+ T roi_start_h = offset_bottom_rois[2] * spatial_scale;
+ T roi_end_w = offset_bottom_rois[3] * spatial_scale;
+ T roi_end_h = offset_bottom_rois[4] * spatial_scale;
+ // T roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ // T roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ // T roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ // T roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ T roi_width = max(roi_end_w - roi_start_w, (T)1.);
+ T roi_height = max(roi_end_h - roi_start_h, (T)1.);
+ T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+ T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width;
+
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
+
+ // We use roi_bin_grid to sample the grid and mimic integral
+ int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2
+ int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
+
+ // We do average (integral) pooling inside a bin
+ const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
+
+ for (int iy = 0; iy < roi_bin_grid_h; iy ++) // e.g., iy = 0, 1
+ {
+ const T y = roi_start_h + ph * bin_size_h + static_cast<T>(iy + .5f) * bin_size_h / static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
+ for (int ix = 0; ix < roi_bin_grid_w; ix ++)
+ {
+ const T x = roi_start_w + pw * bin_size_w + static_cast<T>(ix + .5f) * bin_size_w / static_cast<T>(roi_bin_grid_w);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+
+ bilinear_interpolate_gradient(height, width, y, x,
+ w1, w2, w3, w4,
+ x_low, x_high, y_low, y_high,
+ index);
+
+ T g1 = top_diff_this_bin * w1 / count;
+ T g2 = top_diff_this_bin * w2 / count;
+ T g3 = top_diff_this_bin * w3 / count;
+ T g4 = top_diff_this_bin * w4 / count;
+
+ if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0)
+ {
+ atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast<T>(g1));
+ atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast<T>(g2));
+ atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast<T>(g3));
+ atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast<T>(g4));
+ } // if
+ } // ix
+ } // iy
+ } // CUDA_1D_KERNEL_LOOP
+ } // RoIAlignBackward
+
+
+ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int sampling_ratio) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return output;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
+ RoIAlignForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ rois.contiguous().data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return output;
+ }
+
+ // TODO remove the dependency on input and use instead its sizes -> save memory
+ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width,
+ const int sampling_ratio) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
+ RoIAlignBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad.contiguous().data_ptr<scalar_t>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ sampling_ratio,
+ grad_input.data_ptr<scalar_t>(),
+ rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
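The sampling pattern used by both CUDA kernels above can be summarized on the host: each output bin (ph, pw) is covered by a roi_bin_grid_h x roi_bin_grid_w grid of points placed at sub-cell centers, each bilinearly interpolated and averaged. A scalar sketch of the coordinate computation (illustrative only; it mirrors the kernel expressions):

#include <utility>

std::pair<float, float> roi_align_sample_xy(float roi_start_h, float roi_start_w,
                                            float bin_size_h, float bin_size_w,
                                            int ph, int pw, int iy, int ix,
                                            int grid_h, int grid_w) {
  // Sub-cell centers: offset (i + 0.5) / grid within the bin, as in the kernel.
  float y = roi_start_h + ph * bin_size_h + (iy + 0.5f) * bin_size_h / grid_h;
  float x = roi_start_w + pw * bin_size_w + (ix + 0.5f) * bin_size_w / grid_w;
  return {y, x};
}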
maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu ADDED
@@ -0,0 +1,202 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __global__ void RoIPoolFForward(const int nthreads, const T* bottom_data,
+ const T spatial_scale, const int channels, const int height,
+ const int width, const int pooled_height, const int pooled_width,
+ const T* bottom_rois, T* top_data, int* argmax_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int roi_start_w = round(offset_bottom_rois[1] * spatial_scale);
+ int roi_start_h = round(offset_bottom_rois[2] * spatial_scale);
+ int roi_end_w = round(offset_bottom_rois[3] * spatial_scale);
+ int roi_end_h = round(offset_bottom_rois[4] * spatial_scale);
+
+ // Force malformed ROIs to be 1x1
+ int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+ int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+ T bin_size_h = static_cast<T>(roi_height)
+ / static_cast<T>(pooled_height);
+ T bin_size_w = static_cast<T>(roi_width)
+ / static_cast<T>(pooled_width);
+
+ int hstart = static_cast<int>(floor(static_cast<T>(ph)
+ * bin_size_h));
+ int wstart = static_cast<int>(floor(static_cast<T>(pw)
+ * bin_size_w));
+ int hend = static_cast<int>(ceil(static_cast<T>(ph + 1)
+ * bin_size_h));
+ int wend = static_cast<int>(ceil(static_cast<T>(pw + 1)
+ * bin_size_w));
+
+ // Add roi offsets and clip to input boundaries
+ hstart = min(max(hstart + roi_start_h, 0), height);
+ hend = min(max(hend + roi_start_h, 0), height);
+ wstart = min(max(wstart + roi_start_w, 0), width);
+ wend = min(max(wend + roi_start_w, 0), width);
+ bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+ // Define an empty pooling region to be zero
+ T maxval = is_empty ? 0 : -FLT_MAX;
+ // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+ int maxidx = -1;
+ const T* offset_bottom_data =
+ bottom_data + (roi_batch_ind * channels + c) * height * width;
+ for (int h = hstart; h < hend; ++h) {
+ for (int w = wstart; w < wend; ++w) {
+ int bottom_index = h * width + w;
+ if (offset_bottom_data[bottom_index] > maxval) {
+ maxval = offset_bottom_data[bottom_index];
+ maxidx = bottom_index;
+ }
+ }
+ }
+ top_data[index] = maxval;
+ argmax_data[index] = maxidx;
+ }
+ }
+
+ template <typename T>
+ __global__ void RoIPoolFBackward(const int nthreads, const T* top_diff,
+ const int* argmax_data, const int num_rois, const T spatial_scale,
+ const int channels, const int height, const int width,
+ const int pooled_height, const int pooled_width, T* bottom_diff,
+ const T* bottom_rois) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the pooled output
+ int pw = index % pooled_width;
+ int ph = (index / pooled_width) % pooled_height;
+ int c = (index / pooled_width / pooled_height) % channels;
+ int n = index / pooled_width / pooled_height / channels;
+
+ const T* offset_bottom_rois = bottom_rois + n * 5;
+ int roi_batch_ind = offset_bottom_rois[0];
+ int bottom_offset = (roi_batch_ind * channels + c) * height * width;
+ int top_offset = (n * channels + c) * pooled_height * pooled_width;
+ const T* offset_top_diff = top_diff + top_offset;
+ T* offset_bottom_diff = bottom_diff + bottom_offset;
+ const int* offset_argmax_data = argmax_data + top_offset;
+
+ int argmax = offset_argmax_data[ph * pooled_width + pw];
+ if (argmax != -1) {
+ atomicAdd(
+ offset_bottom_diff + argmax,
+ static_cast<T>(offset_top_diff[ph * pooled_width + pw]));
+
+ }
+ }
+ }
+
+ std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
+ const at::Tensor& rois,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width) {
+ AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+
+ auto num_rois = rois.size(0);
+ auto channels = input.size(1);
+ auto height = input.size(2);
+ auto width = input.size(3);
+
+ auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, input.options());
+ auto output_size = num_rois * pooled_height * pooled_width * channels;
+ auto argmax = at::zeros({num_rois, channels, pooled_height, pooled_width}, input.options().dtype(at::kInt));
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(output_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (output.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] {
+ RoIPoolFForward<scalar_t><<<grid, block, 0, stream>>>(
+ output_size,
+ input.contiguous().data_ptr<scalar_t>(),
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ rois.contiguous().data_ptr<scalar_t>(),
+ output.data_ptr<scalar_t>(),
+ argmax.data_ptr<int>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return std::make_tuple(output, argmax);
+ }
+
+ // TODO remove the dependency on input and use instead its sizes -> save memory
+ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
+ const at::Tensor& input,
+ const at::Tensor& rois,
+ const at::Tensor& argmax,
+ const float spatial_scale,
+ const int pooled_height,
+ const int pooled_width,
+ const int batch_size,
+ const int channels,
+ const int height,
+ const int width) {
+ AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
+ AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
+ // TODO add more checks
+
+ auto num_rois = rois.size(0);
+ auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options());
+
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(grad.numel(), 512L), 4096L));
+ dim3 block(512);
+
+ // handle possibly empty gradients
+ if (grad.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] {
+ RoIPoolFBackward<scalar_t><<<grid, block, 0, stream>>>(
+ grad.numel(),
+ grad.contiguous().data_ptr<scalar_t>(),
+ argmax.data_ptr<int>(),
+ num_rois,
+ spatial_scale,
+ channels,
+ height,
+ width,
+ pooled_height,
+ pooled_width,
+ grad_input.data_ptr<scalar_t>(),
+ rois.contiguous().data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return grad_input;
+ }
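Unlike RoIAlign, RoIPool quantizes the ROI to integer bin edges and max-pools, keeping an argmax map so the backward pass routes each gradient to a single input cell. A hypothetical forward call (requires a CUDA build; the signature is the one defined above):

#include <torch/torch.h>

void roi_pool_example() {
  auto input = torch::rand({1, 256, 32, 32}, torch::kCUDA);
  // (batch_index, x1, y1, x2, y2) in image coordinates.
  auto rois = torch::tensor({0.f, 0.f, 0.f, 64.f, 64.f}, torch::kFloat)
                  .view({1, 5}).to(torch::kCUDA);
  // spatial_scale 0.25 maps image coords onto the 32x32 feature map.
  auto result = ROIPool_forward_cuda(input, rois, 0.25f, 7, 7);
  // std::get<0>(result): (1, 256, 7, 7) pooled features;
  // std::get<1>(result): int argmax indices consumed by ROIPool_backward_cuda.
}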
maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu ADDED
@@ -0,0 +1,188 @@
+ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+ // This file is modified from https://github.com/pytorch/pytorch/blob/master/modules/detectron/sigmoid_focal_loss_op.cu
+ // Cheng-Yang Fu
+ // cyfu@cs.unc.edu
+ #include <ATen/ATen.h>
+ #include <ATen/cuda/CUDAContext.h>
+
+ #include <THC/THC.h>
+ #include <THC/THCAtomics.cuh>
+ #include <THC/THCDeviceUtils.cuh>
+
+ #include <cfloat>
+
+ // TODO make it in a common file
+ #define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+ i += blockDim.x * gridDim.x)
+
+
+ template <typename T>
+ __global__ void SigmoidFocalLossForward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* losses) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80];
+
+ // Decide whether it is a positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+
+ // p = 1. / (1. + expf(-x)); p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+ // (1-p)**gamma * log(p)
+ T term1 = powf((1. - p), gamma) * logf(max(p, FLT_MIN));
+
+ // p**gamma * log(1-p)
+ T term2 = powf(p, gamma) *
+ (-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0))));
+
+ losses[i] = 0.0;
+ losses[i] += -c1 * term1 * zp;
+ losses[i] += -c2 * term2 * zn;
+
+ } // CUDA_1D_KERNEL_LOOP
+ } // SigmoidFocalLossForward
+
+
+ template <typename T>
+ __global__ void SigmoidFocalLossBackward(const int nthreads,
+ const T* logits,
+ const int* targets,
+ const T* d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha,
+ const int num,
+ T* d_logits) {
+ CUDA_1D_KERNEL_LOOP(i, nthreads) {
+
+ int n = i / num_classes;
+ int d = i % num_classes; // current class[0~79];
+ int t = targets[n]; // target class [1~80], 0 is background;
+
+ // Decide whether it is a positive or negative case.
+ T c1 = (t == (d+1));
+ T c2 = (t>=0 & t != (d+1));
+
+ T zn = (1.0 - alpha);
+ T zp = (alpha);
+ // p = 1. / (1. + expf(-x)); p = sigmoid(x)
+ T p = 1. / (1. + expf(-logits[i]));
+
+ // (1-p)**g * (1 - p - g*p*log(p))
+ T term1 = powf((1. - p), gamma) *
+ (1. - p - (p * gamma * logf(max(p, FLT_MIN))));
+
+ // (p**g) * (g*(1-p)*log(1-p) - p)
+ T term2 = powf(p, gamma) *
+ ((-1. * logits[i] * (logits[i] >= 0) -
+ logf(1. + expf(logits[i] - 2. * logits[i] * (logits[i] >= 0)))) *
+ (1. - p) * gamma - p);
+ d_logits[i] = 0.0;
+ d_logits[i] += -c1 * term1 * zp;
+ d_logits[i] += -c2 * term2 * zn;
+ d_logits[i] = d_logits[i] * d_losses[i];
+
+ } // CUDA_1D_KERNEL_LOOP
+ } // SigmoidFocalLossBackward
+
+
+ at::Tensor SigmoidFocalLoss_forward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+
+ auto losses = at::empty({num_samples, logits.size(1)}, logits.options());
+ auto losses_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (losses.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return losses;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
+ SigmoidFocalLossForward<scalar_t><<<grid, block, 0, stream>>>(
+ losses_size,
+ logits.contiguous().data_ptr<scalar_t>(),
+ targets.contiguous().data_ptr<int>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+ losses.data_ptr<scalar_t>());
+ });
+ THCudaCheck(cudaGetLastError());
+ return losses;
+ }
+
+
+ at::Tensor SigmoidFocalLoss_backward_cuda(
+ const at::Tensor& logits,
+ const at::Tensor& targets,
+ const at::Tensor& d_losses,
+ const int num_classes,
+ const float gamma,
+ const float alpha) {
+ AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+ AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+ AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
+
+ AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
+
+ const int num_samples = logits.size(0);
+ AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes");
+
+ auto d_logits = at::zeros({num_samples, num_classes}, logits.options());
+ auto d_logits_size = num_samples * logits.size(1);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L));
+ dim3 block(512);
+
+ if (d_logits.numel() == 0) {
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+ }
+
+ AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
+ SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, stream>>>(
+ d_logits_size,
+ logits.contiguous().data_ptr<scalar_t>(),
+ targets.contiguous().data_ptr<int>(),
+ d_losses.contiguous().data_ptr<scalar_t>(),
+ num_classes,
+ gamma,
+ alpha,
+ num_samples,
+ d_logits.data_ptr<scalar_t>());
+ });
+
+ THCudaCheck(cudaGetLastError());
+ return d_logits;
+ }
+
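The forward kernel evaluates the sigmoid focal loss of Lin et al. per (sample, class) logit: -alpha * (1-p)^gamma * log(p) when the class is the target, and -(1-alpha) * p^gamma * log(1-p) otherwise, with log(1-p) expanded into the numerically stable form seen above. A scalar reference (illustrative only, without the stable expansion):

#include <cfloat>
#include <cmath>

float sigmoid_focal_loss_ref(float logit, bool is_target_class,
                             float gamma, float alpha) {
  float p = 1.f / (1.f + std::exp(-logit));
  if (is_target_class)  // positive: -alpha * (1-p)^gamma * log(p)
    return -alpha * std::pow(1.f - p, gamma) * std::log(std::max(p, FLT_MIN));
  // negative: -(1-alpha) * p^gamma * log(1-p)
  return -(1.f - alpha) * std::pow(p, gamma) * std::log(std::max(1.f - p, FLT_MIN));
}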
maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu ADDED
@@ -0,0 +1,691 @@
1
+ // modify from
2
+ // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
3
+
4
+ #include <ATen/ATen.h>
5
+ #include <ATen/cuda/CUDAContext.h>
6
+
7
+ #include <THC/THC.h>
8
+ #include <THC/THCDeviceUtils.cuh>
9
+
10
+ #include <vector>
11
+ #include <iostream>
12
+ #include <cmath>
13
+
14
+
15
+ void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
16
+ const int channels, const int height, const int width,
17
+ const int ksize_h, const int ksize_w, const int pad_h,
18
+ const int pad_w, const int stride_h, const int stride_w,
19
+ const int dilation_h, const int dilation_w,
20
+ const int parallel_imgs, const int deformable_group,
21
+ at::Tensor data_col);
22
+
23
+ void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
24
+ const int channels, const int height, const int width,
25
+ const int ksize_h, const int ksize_w, const int pad_h,
26
+ const int pad_w, const int stride_h, const int stride_w,
27
+ const int dilation_h, const int dilation_w,
28
+ const int parallel_imgs, const int deformable_group,
29
+ at::Tensor grad_im);
30
+
31
+ void deformable_col2im_coord(
32
+ const at::Tensor data_col, const at::Tensor data_im,
33
+ const at::Tensor data_offset, const int channels, const int height,
34
+ const int width, const int ksize_h, const int ksize_w, const int pad_h,
35
+ const int pad_w, const int stride_h, const int stride_w,
36
+ const int dilation_h, const int dilation_w, const int parallel_imgs,
37
+ const int deformable_group, at::Tensor grad_offset);
38
+
39
+ void modulated_deformable_im2col_cuda(
40
+ const at::Tensor data_im, const at::Tensor data_offset,
41
+ const at::Tensor data_mask, const int batch_size, const int channels,
42
+ const int height_im, const int width_im, const int height_col,
43
+ const int width_col, const int kernel_h, const int kenerl_w,
44
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
45
+ const int dilation_h, const int dilation_w, const int deformable_group,
46
+ at::Tensor data_col);
47
+
48
+ void modulated_deformable_col2im_cuda(
49
+ const at::Tensor data_col, const at::Tensor data_offset,
50
+ const at::Tensor data_mask, const int batch_size, const int channels,
51
+ const int height_im, const int width_im, const int height_col,
52
+ const int width_col, const int kernel_h, const int kenerl_w,
53
+ const int pad_h, const int pad_w, const int stride_h, const int stride_w,
54
+ const int dilation_h, const int dilation_w, const int deformable_group,
55
+ at::Tensor grad_im);
56
+
57
+ void modulated_deformable_col2im_coord_cuda(
58
+ const at::Tensor data_col, const at::Tensor data_im,
59
+ const at::Tensor data_offset, const at::Tensor data_mask,
60
+ const int batch_size, const int channels, const int height_im,
61
+ const int width_im, const int height_col, const int width_col,
62
+ const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w,
63
+ const int stride_h, const int stride_w, const int dilation_h,
64
+ const int dilation_w, const int deformable_group, at::Tensor grad_offset,
65
+ at::Tensor grad_mask);
66
+
67
+ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
68
+ at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
69
+ int padW, int dilationH, int dilationW, int group,
70
+ int deformable_group)
71
+ {
72
+ TORCH_CHECK(weight.ndimension() == 4,
73
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
74
+ "but got: %s",
75
+ weight.ndimension());
76
+
77
+ TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
78
+
79
+ TORCH_CHECK(kW > 0 && kH > 0,
80
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
81
+ kW);
82
+
83
+ TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
84
+ "kernel size should be consistent with weight, ",
85
+ "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
86
+ kW, weight.size(2), weight.size(3));
87
+
88
+ TORCH_CHECK(dW > 0 && dH > 0,
89
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
90
+
91
+ TORCH_CHECK(
92
+ dilationW > 0 && dilationH > 0,
93
+ "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
94
+ dilationH, dilationW);
95
+
96
+ int ndim = input.ndimension();
97
+ int dimf = 0;
98
+ int dimh = 1;
99
+ int dimw = 2;
100
+
101
+ if (ndim == 4) {
102
+ dimf++;
103
+ dimh++;
104
+ dimw++;
105
+ }
106
+
107
+ TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
108
+ ndim);
109
+
110
+ long nInputPlane = weight.size(1) * group;
111
+ long inputHeight = input.size(dimh);
112
+ long inputWidth = input.size(dimw);
113
+ long nOutputPlane = weight.size(0);
114
+ long outputHeight =
115
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
116
+ long outputWidth =
117
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
118
+
119
+ TORCH_CHECK(nInputPlane % deformable_group == 0,
120
+ "input channels must divide deformable group size");
121
+
122
+ if (outputWidth < 1 || outputHeight < 1)
123
+ AT_ERROR(
124
+ "Given input size: (%ld x %ld x %ld). "
125
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
126
+ nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
127
+ outputWidth);
128
+
129
+ TORCH_CHECK(input.size(1) == nInputPlane,
130
+ "invalid number of input planes, expected: %d, but got: %d",
131
+ nInputPlane, input.size(1));
132
+
133
+ TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
134
+ "input image is smaller than kernel");
135
+
136
+ TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
137
+ "invalid spatial size of offset, expected height: %d width: %d, but "
138
+ "got height: %d width: %d",
139
+ outputHeight, outputWidth, offset.size(2), offset.size(3));
140
+
141
+ TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
142
+ "invalid number of channels of offset");
143
+
144
+ if (gradOutput != NULL) {
145
+ TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
146
+ "invalid number of gradOutput planes, expected: %d, but got: %d",
147
+ nOutputPlane, gradOutput->size(dimf));
148
+
149
+ TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
150
+ gradOutput->size(dimw) == outputWidth),
151
+ "invalid size of gradOutput, expected height: %d width: %d , but "
152
+ "got height: %d width: %d",
153
+ outputHeight, outputWidth, gradOutput->size(dimh),
154
+ gradOutput->size(dimw));
155
+ }
156
+ }
157
+
158
+ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
159
+ at::Tensor offset, at::Tensor output,
160
+ at::Tensor columns, at::Tensor ones, int kW,
161
+ int kH, int dW, int dH, int padW, int padH,
162
+ int dilationW, int dilationH, int group,
163
+ int deformable_group, int im2col_step)
164
+ {
165
+ // todo: resize columns to include im2col: done
166
+ // todo: add im2col_step as input
167
+ // todo: add new output buffer and transpose it to output (or directly
168
+ // transpose output) todo: possibly change data indexing because of
169
+ // parallel_imgs
170
+
171
+ shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
172
+ dilationH, dilationW, group, deformable_group);
173
+
174
+ input = input.contiguous();
175
+ offset = offset.contiguous();
176
+ weight = weight.contiguous();
177
+
178
+ int batch = 1;
179
+ if (input.ndimension() == 3) {
180
+ // Force batch
181
+ batch = 0;
182
+ input.unsqueeze_(0);
183
+ offset.unsqueeze_(0);
184
+ }
185
+
186
+ // todo: assert batchsize dividable by im2col_step
187
+
188
+ long batchSize = input.size(0);
189
+ long nInputPlane = input.size(1);
190
+ long inputHeight = input.size(2);
191
+ long inputWidth = input.size(3);
192
+
193
+ long nOutputPlane = weight.size(0);
194
+
195
+ long outputWidth =
196
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
197
+ long outputHeight =
198
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
199
+
200
+ TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
201
+
202
+ output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
203
+ outputHeight, outputWidth});
204
+ columns = at::zeros(
205
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
206
+ input.options());
207
+
208
+ if (ones.ndimension() != 2 ||
209
+ ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
210
+ ones = at::ones({outputHeight, outputWidth}, input.options());
211
+ }
212
+
213
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
214
+ inputHeight, inputWidth});
215
+ offset =
216
+ offset.view({batchSize / im2col_step, im2col_step,
217
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
218
+
219
+ at::Tensor output_buffer =
220
+ at::zeros({batchSize / im2col_step, nOutputPlane,
221
+ im2col_step * outputHeight, outputWidth},
222
+ output.options());
223
+
224
+ output_buffer = output_buffer.view(
225
+ {output_buffer.size(0), group, output_buffer.size(1) / group,
226
+ output_buffer.size(2), output_buffer.size(3)});
227
+
228
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
229
+ deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
230
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
231
+ dilationW, im2col_step, deformable_group, columns);
232
+
233
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
234
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
235
+ weight.size(2), weight.size(3)});
236
+
237
+ for (int g = 0; g < group; g++) {
238
+ output_buffer[elt][g] = output_buffer[elt][g]
239
+ .flatten(1)
240
+ .addmm_(weight[g].flatten(1), columns[g])
241
+ .view_as(output_buffer[elt][g]);
242
+ }
243
+ }
244
+
245
+ output_buffer = output_buffer.view(
246
+ {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
247
+ output_buffer.size(3), output_buffer.size(4)});
248
+
249
+ output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
250
+ im2col_step, outputHeight, outputWidth});
251
+ output_buffer.transpose_(1, 2);
252
+ output.copy_(output_buffer);
253
+ output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
254
+
255
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
256
+ offset = offset.view(
257
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
258
+
259
+ if (batch == 0) {
260
+ output = output.view({nOutputPlane, outputHeight, outputWidth});
261
+ input = input.view({nInputPlane, inputHeight, inputWidth});
262
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
263
+ }
264
+
265
+ return 1;
266
+ }
267
+
268
+ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
269
+ at::Tensor gradOutput, at::Tensor gradInput,
270
+ at::Tensor gradOffset, at::Tensor weight,
271
+ at::Tensor columns, int kW, int kH, int dW,
272
+ int dH, int padW, int padH, int dilationW,
273
+ int dilationH, int group,
274
+ int deformable_group, int im2col_step)
275
+ {
276
+ shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
277
+ dilationH, dilationW, group, deformable_group);
278
+
279
+ input = input.contiguous();
280
+ offset = offset.contiguous();
281
+ gradOutput = gradOutput.contiguous();
282
+ weight = weight.contiguous();
283
+
284
+ int batch = 1;
285
+
286
+ if (input.ndimension() == 3) {
287
+ // Force batch
288
+ batch = 0;
289
+ input = input.view({1, input.size(0), input.size(1), input.size(2)});
290
+ offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
291
+ gradOutput = gradOutput.view(
292
+ {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
293
+ }
294
+
295
+ long batchSize = input.size(0);
296
+ long nInputPlane = input.size(1);
297
+ long inputHeight = input.size(2);
298
+ long inputWidth = input.size(3);
299
+
300
+ long nOutputPlane = weight.size(0);
301
+
302
+ long outputWidth =
303
+ (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
304
+ long outputHeight =
305
+ (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
306
+
307
+ TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
308
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
309
+ columns = at::zeros(
310
+ {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
311
+ input.options());
312
+
313
+ // change order of grad output
314
+ gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
315
+ nOutputPlane, outputHeight, outputWidth});
316
+ gradOutput.transpose_(1, 2);
317
+
318
+ gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
319
+ inputHeight, inputWidth});
320
+ input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
321
+ inputHeight, inputWidth});
322
+ gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
323
+ deformable_group * 2 * kH * kW, outputHeight,
324
+ outputWidth});
325
+ offset =
326
+ offset.view({batchSize / im2col_step, im2col_step,
327
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth});
328
+
329
+ for (int elt = 0; elt < batchSize / im2col_step; elt++) {
330
+ // divide into groups
331
+ columns = columns.view({group, columns.size(0) / group, columns.size(1)});
332
+ weight = weight.view({group, weight.size(0) / group, weight.size(1),
333
+ weight.size(2), weight.size(3)});
334
+ gradOutput = gradOutput.view(
335
+ {gradOutput.size(0), group, gradOutput.size(1) / group,
336
+ gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
337
+
338
+ for (int g = 0; g < group; g++) {
339
+ columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
340
+ gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
341
+ }
342
+
343
+ columns =
344
+ columns.view({columns.size(0) * columns.size(1), columns.size(2)});
345
+ gradOutput = gradOutput.view(
346
+ {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
347
+ gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
348
+
349
+ deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
350
+ inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
351
+ dilationH, dilationW, im2col_step, deformable_group,
352
+ gradOffset[elt]);
353
+
354
+ deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
355
+ inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
356
+ dilationW, im2col_step, deformable_group, gradInput[elt]);
357
+ }
358
+
359
+ gradOutput.transpose_(1, 2);
360
+ gradOutput =
361
+ gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
362
+
363
+ gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
364
+ input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
365
+ gradOffset = gradOffset.view(
366
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
367
+ offset = offset.view(
368
+ {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
369
+
370
+ if (batch == 0) {
371
+ gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
372
+ input = input.view({nInputPlane, inputHeight, inputWidth});
373
+ gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
374
+ offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
375
+ gradOffset =
376
+ gradOffset.view({offset.size(1), offset.size(2), offset.size(3)});
377
+ }
378
+
+   return 1;
+ }
+
+ int deform_conv_backward_parameters_cuda(
+     at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
+     at::Tensor gradWeight,  // at::Tensor gradBias,
+     at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
+     int padW, int padH, int dilationW, int dilationH, int group,
+     int deformable_group, float scale, int im2col_step)
+ {
+   // todo: transpose and reshape outGrad
+   // todo: reshape columns
+   // todo: add im2col_step as input
+
+   shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
+               padW, dilationH, dilationW, group, deformable_group);
+
+   input = input.contiguous();
+   offset = offset.contiguous();
+   gradOutput = gradOutput.contiguous();
+
+   int batch = 1;
+
+   if (input.ndimension() == 3) {
+     // Force batch
+     batch = 0;
+     input = input.view(
+         at::IntList({1, input.size(0), input.size(1), input.size(2)}));
+     gradOutput = gradOutput.view(
+         {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+   }
+
+   long batchSize = input.size(0);
+   long nInputPlane = input.size(1);
+   long inputHeight = input.size(2);
+   long inputWidth = input.size(3);
+
+   long nOutputPlane = gradWeight.size(0);
+
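+   // standard convolution output size:
+   //   out = floor((in + 2 * pad - (dilation * (k - 1) + 1)) / stride) + 1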
+   long outputWidth =
+       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+   long outputHeight =
+       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+   TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+   columns = at::zeros(
+       {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+       input.options());
+
+   gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                 nOutputPlane, outputHeight, outputWidth});
+   gradOutput.transpose_(1, 2);
+
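+   // Stage a contiguous copy of the transposed gradOutput so the im2col_step
+   // and spatial dimensions can be flattened together for the per-group GEMM
+   // below.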
+   at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+   gradOutputBuffer =
+       gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
+                              outputHeight, outputWidth});
+   gradOutputBuffer.copy_(gradOutput);
+   gradOutputBuffer =
+       gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
+                              im2col_step * outputHeight, outputWidth});
+
+   gradOutput.transpose_(1, 2);
+   gradOutput =
+       gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+   input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                       inputHeight, inputWidth});
+   offset =
+       offset.view({batchSize / im2col_step, im2col_step,
+                    deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
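+   // For each slice: recompute the column buffer with deformable_im2col, then
+   // accumulate gradWeight[g] += scale * gradOutput_g * columns_g^T per group.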
+   for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+     deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                       inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                       dilationW, im2col_step, deformable_group, columns);
+
+     // divide into groups
+     gradOutputBuffer = gradOutputBuffer.view(
+         {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+          gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     gradWeight =
+         gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+                          gradWeight.size(2), gradWeight.size(3)});
+
+     for (int g = 0; g < group; g++) {
+       gradWeight[g] = gradWeight[g]
+                           .flatten(1)
+                           .addmm_(gradOutputBuffer[elt][g].flatten(1),
+                                   columns[g].transpose(1, 0), 1.0, scale)
+                           .view_as(gradWeight[g]);
+     }
+     gradOutputBuffer = gradOutputBuffer.view(
+         {gradOutputBuffer.size(0),
+          gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+          gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                   gradWeight.size(2), gradWeight.size(3),
+                                   gradWeight.size(4)});
+   }
+
+   input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+   offset = offset.view(
+       {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+   if (batch == 0) {
+     gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+     input = input.view({nInputPlane, inputHeight, inputWidth});
+   }
+
+   return 1;
+ }
+
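+ // Modulated deformable convolution ("DCNv2"): in addition to the learned
+ // sampling offsets, every sampled value is scaled by a per-location
+ // modulation mask.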
+ void modulated_deform_conv_cuda_forward(
+     at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+     at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+     int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+     const int pad_h, const int pad_w, const int dilation_h,
+     const int dilation_w, const int group, const int deformable_group,
+     const bool with_bias)
+ {
+   TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+   TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+   const int batch = input.size(0);
+   const int channels = input.size(1);
+   const int height = input.size(2);
+   const int width = input.size(3);
+
+   const int channels_out = weight.size(0);
+   const int channels_kernel = weight.size(1);
+   const int kernel_h_ = weight.size(2);
+   const int kernel_w_ = weight.size(3);
+
+   if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+     AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
+              kernel_h, kernel_w, kernel_h_, kernel_w_);
+   if (channels != channels_kernel * group)
+     AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
+              channels, channels_kernel * group);
+
+   const int height_out =
+       (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+   const int width_out =
+       (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+   if (ones.ndimension() != 2 ||
+       ones.size(0) * ones.size(1) < height_out * width_out) {
+     // Resize plane and fill with ones...
+     ones = at::ones({height_out, width_out}, input.options());
+   }
+
+   // resize output
+   output = output.view({batch, channels_out, height_out, width_out}).zero_();
+   // resize temporary columns
+   columns =
+       at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+                 input.options());
+
+   output = output.view({output.size(0), group, output.size(1) / group,
+                         output.size(2), output.size(3)});
+
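+   // For each sample: gather the offset- and mask-modulated input patches
+   // into the column buffer, then a grouped GEMM computes
+   // output[b][g] = W_g * columns_g.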
+   for (int b = 0; b < batch; b++) {
+     modulated_deformable_im2col_cuda(
+         input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, columns);
+
+     // divide into groups
+     weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                           weight.size(2), weight.size(3)});
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+     for (int g = 0; g < group; g++) {
+       output[b][g] = output[b][g]
+                          .flatten(1)
+                          .addmm_(weight[g].flatten(1), columns[g])
+                          .view_as(output[b][g]);
+     }
+
+     weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                           weight.size(3), weight.size(4)});
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+   }
+
+   output = output.view({output.size(0), output.size(1) * output.size(2),
+                         output.size(3), output.size(4)});
+
+   if (with_bias) {
+     output += bias.view({1, bias.size(0), 1, 1});
+   }
+ }
+
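+ // Backward pass for the modulated variant: one call fills grad_input,
+ // grad_weight, grad_offset, grad_mask and, when with_bias is set, grad_bias.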
+ void modulated_deform_conv_cuda_backward(
+     at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+     at::Tensor offset, at::Tensor mask, at::Tensor columns,
+     at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+     at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+     int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+     int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+     const bool with_bias)
+ {
+   TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+   TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+   const int batch = input.size(0);
+   const int channels = input.size(1);
+   const int height = input.size(2);
+   const int width = input.size(3);
+
+   const int channels_kernel = weight.size(1);
+   const int kernel_h_ = weight.size(2);
+   const int kernel_w_ = weight.size(3);
+   if (kernel_h_ != kernel_h || kernel_w_ != kernel_w)
+     AT_ERROR("Input shape and kernel shape won't match: (%d x %d vs %d x %d).",
+              kernel_h, kernel_w, kernel_h_, kernel_w_);
+   if (channels != channels_kernel * group)
+     AT_ERROR("Input shape and kernel channels won't match: (%d vs %d).",
+              channels, channels_kernel * group);
+
+   const int height_out =
+       (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+   const int width_out =
+       (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+   if (ones.ndimension() != 2 ||
+       ones.size(0) * ones.size(1) < height_out * width_out) {
+     // Resize plane and fill with ones...
+     ones = at::ones({height_out, width_out}, input.options());
+   }
+
+   grad_input = grad_input.view({batch, channels, height, width});
+   columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+                       input.options());
+
+   grad_output =
+       grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+                         grad_output.size(2), grad_output.size(3)});
+
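+   // Per-sample backward: (1) rebuild the column gradients as
+   // weight^T * grad_output, (2) scatter them into grad_offset/grad_mask and
+   // grad_input via the two col2im kernels, then (3) recompute the forward
+   // columns with im2col to accumulate grad_weight (and grad_bias).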
+   for (int b = 0; b < batch; b++) {
+     // divide into groups
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                           weight.size(2), weight.size(3)});
+
+     for (int g = 0; g < group; g++) {
+       columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                         grad_output[b][g].flatten(1), 0.0f, 1.0f);
+     }
+
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                           weight.size(3), weight.size(4)});
+
+     // gradient w.r.t. input coordinate data
+     modulated_deformable_col2im_coord_cuda(
+         columns, input[b], offset[b], mask[b], 1, channels, height, width,
+         height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
+         stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
+         grad_mask[b]);
+     // gradient w.r.t. input data
+     modulated_deformable_col2im_cuda(
+         columns, offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, grad_input[b]);
+
+     // gradient w.r.t. weight, dWeight should accumulate across the batch and
+     // group
+     modulated_deformable_im2col_cuda(
+         input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+         width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+         dilation_h, dilation_w, deformable_group, columns);
+
+     columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+     grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+                                     grad_weight.size(1), grad_weight.size(2),
+                                     grad_weight.size(3)});
+     if (with_bias)
+       grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+     for (int g = 0; g < group; g++) {
+       grad_weight[g] =
+           grad_weight[g]
+               .flatten(1)
+               .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+               .view_as(grad_weight[g]);
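+       // bias gradient: multiplying grad_output by the all-ones column vector
+       // sums it over all spatial positions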
+       if (with_bias) {
+         grad_bias[g] =
+             grad_bias[g]
+                 .view({-1, 1})
+                 .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                 .view(-1);
+       }
+     }
+
+     columns =
+         columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+     grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                     grad_weight.size(2), grad_weight.size(3),
+                                     grad_weight.size(4)});
+     if (with_bias)
+       grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+   }
+   grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                   grad_output.size(2), grad_output.size(3),
+                                   grad_output.size(4)});
+ }