File size: 4,746 Bytes
749745d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# Model section: meta-architecture, backbones, RPN anchors and DyHead options.
# Parenthesised values such as (2, 2, 18, 2) are plain YAML scalars, i.e. they
# load as strings; presumably the config loader converts them to tuples —
# TODO confirm against the consumer.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # NOTE(review): the file name mentions "window7" while SWINT.WINDOW_SIZE
  # below is 12 — confirm this combination is intended.
  WEIGHT: "MODEL/swin_base_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # One entry per stage in DEPTHS / NUM_HEADS / OUT_CHANNELS (four values each).
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  # ANCHOR_SIZES and ANCHOR_STRIDE each list five values.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    # topk for selecting candidate positive samples from each level
    TOPK: 9
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      EARLY_FUSE_ON: False
      # Listed options: "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      TYPE: "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Dataset section: dataset registrations, caption/description resources,
# train/test dataset selection and caption-packing options.
# Every key appears exactly once in this mapping; YAML forbids duplicate keys
# and strict loaders reject them.
DATASETS:
  # Named dataset entries. Paths are relative; the root they are resolved
  # against is not visible in this file.
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"
    mixed_train_no_coco_noun:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns.json"
    mixed_train_no_coco_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_gpt.v1.new.json"
    flickr30k_train_gpt:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.v1.json"
      is_train: True
    mixed_train_no_coco_noun_gpt:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.v1.json"
    mixed_train_no_coco_noun_gpt_0425:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
    flickr30k_train_gpt_0425:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
      is_train: True

  CAPTION_CONF: 0.4
  OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1"
  CAPTION_AUGMENTATION_VERSION: "mixed.v4.8-2.drop_positive.control_pos.grouping.v1"
  CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3-v4.9-1.drop_positive.control_pos.grouping.v1"
  CAPTION_VOCAB_FILE: "tools/files/joint_vocab.merged.v1.tmp0.davincci.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  # TRAIN and TEST are plain scalars (strings) that look like Python tuples;
  # presumably the config loader converts them — TODO confirm.
  # "object365_dt_train" and "coco_2017_val" have no entry under REGISTER
  # above; presumably they are registered elsewhere — verify.
  # Not included in TRAIN: bing_caption_train_no_coco
  TRAIN: ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", )
  TEST: ("coco_2017_val", )
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  ONE_HOT: False
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 2
  DISABLE_SHUFFLE: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  FURTHER_SCREEN: True

  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1

  # Quoted so the trailing space after the period is preserved.
  SEPARATION_TOKENS: ". "

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"

# Input section: per-channel normalisation constants and train/test size limits.
INPUT:
  # Three values each, one per image channel.
  # NOTE(review): the channel order (BGR vs RGB) is not visible here — confirm.
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  # The same MIN/MAX pair is used for training and testing.
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

# Augmentation section.
AUGMENT:
  # Plain scalar, so YAML loads it as the string "(480,560,640,720,800)";
  # presumably the config loader turns it into a tuple of candidate
  # training sizes — TODO confirm.
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

# Data-loader section.
DATALOADER:
  # Presumably batched image sides are padded to a multiple of this value —
  # confirm against the consumer.
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False

# Solver section: optimiser, learning rates, schedule, checkpointing and
# gradient clipping.
SOLVER:
  OPTIMIZER: "ADAMW"
  # LANG_LR is ten times smaller than BASE_LR.
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Plain scalar (string) that looks like a tuple of fractions; presumably
  # interpreted relative to MAX_ITER — TODO confirm.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

# Test section.
TEST:
  DURING_TRAINING: False
  # Same value as SOLVER.IMS_PER_BATCH in this file.
  IMS_PER_BATCH: 64