# Config for the final GLIP-Tiny model, pre-trained from scratch
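# Example launch, as a hedged sketch: tools/train_net.py, --config-file, and the
# trailing yacs-style overrides follow the maskrcnn-benchmark tooling GLIP builds
# on; verify the exact flags against your checkout.
#   python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py \
#     --config-file path/to/this_config.yaml \
#     SOLVER.IMS_PER_BATCH 64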
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth" # ImageNet-pretrained Swin-Tiny checkpoint
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # alternatives: "roberta-base", "clip"
    MASK_SPECIAL: False

  RPN:
    USE_FPN: True
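    # One square anchor per location (single scale per octave, aspect ratio 1.0),
    # DyHead/ATSS-style; each anchor size below is 8x its matching FPN stride.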
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True # activation (gradient) checkpointing in the head to save memory
    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"   # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
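      # The CLAMP_* switches below bound fusion and BERT attention values so they
      # stay within fp16 range when USE_AMP is enabled.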
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Dataset configuration for the grounding model
DATASETS:
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"
    mixed_train_no_coco_noun_gpt_0425:
      coco_img_dir: "coco/train2014"
      vg_img_dir: "gqa/images"
      ann_file: "mdetr_annotations/final_mixed_train_no_coco_with_nouns_gpt.0425.json"
    flickr30k_train_gpt_0425:
      img_folder: "flickr30k/flickr30k_images/train"
      ann_file: "mdetr_annotations/final_flickr_separateGT_train_gpt.0425.json"
      is_train: True
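  # Assumption: the image/annotation paths above resolve against the repo's
  # dataset root (paths-catalog style); adjust them to your local layout.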

  CAPTION_CONF: 0.4 # confidence threshold for keeping pseudo-boxes on caption data

  CAPTION_AUGMENTATION_VERSION: "mixed.v4-v3.5-4-1.drop_positive.control_pos.grouping.v1" # for GoldG data; used by CaptionAugmentation to determine how to perform the augmentation
  OD_TO_GROUNDING_VERSION: "description.gpt.v10.mixed.allow_zero.v1" # for detection (OD) data; controls how detection annotations are rewritten into grounding prompts
  CC_CAPTION_AUGMENTATION_VERSION: "mixed.v3.8-2.drop_positive.control_pos.grouping.v1" # for CC data; used by CaptionAugmentation to determine how to perform the augmentation
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  TRAIN:  ("mixed_train_no_coco_noun_gpt_0425", "flickr30k_train_gpt_0425", "object365_dt_train", ) # bing_caption_train_no_coco
  TEST: ("coco_2017_val", )
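  # The training mix excludes COCO images ("no_coco"), so coco_2017_val serves as
  # a (near) zero-shot benchmark.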
  BING_INDEX_LIST: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  # BING_INDEX_LIST: [ 0, 1, ]
  ONE_HOT: False
  FLICKR_COPY: 2 # repeat factors: how many copies of each source enter the training mix
  MIXED_COPY: 2
  OBJECT365_COPY: 1
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85 # sample negative categories so each detection prompt carries up to 85 classes
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  FURTHER_SCREEN: True

  CAPTION_NMS: -1.0 # negative value disables NMS on caption pseudo-boxes
  CAPTION_MIN_BOX: 1 # require at least one box per caption image

  SEPARATION_TOKENS: ". " # separator used to join category names into one text prompt

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"


INPUT:
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ] # per-channel BGR means on 0-255 inputs (maskrcnn-benchmark convention)
  PIXEL_STD: [ 57.375, 57.120, 58.395 ] # per-channel BGR stds on 0-255 inputs
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800) # short-side sizes sampled for multi-scale training

DATALOADER:
  SIZE_DIVISIBILITY: 32 # pad batched images to multiples of 32 for the FPN
  DISTRIBUTE_CHUNK_AMONG_NODE: False

SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001 # lower learning rate for the language backbone
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89) # LR decay milestones as fractions of MAX_ITER (~201k and ~267k of 300k iters)
  #MAX_EPOCH: 12
  MAX_ITER: 300000
  IMS_PER_BATCH: 64 # global batch size summed across all GPUs
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999 # EMA decay for a moving average of model weights
  FIND_UNUSED_PARAMETERS: False

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model" # clip by the global norm over all model parameters
    CLIP_VALUE: 1.0 # maximum gradient norm
    NORM_TYPE: 2.0 # L2 norm