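# Training config for a GeneralizedVLRCNN grounding model: a Swin-Tiny + FPN
# visual backbone (RetinaNet-style levels), a BERT-base language backbone, and
# a vision-language dynamic head (VLDYHEAD), trained on a mix of grounding and
# detection datasets.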
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased" # "roberta-base", "clip"
    MASK_SPECIAL: False

  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1
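    # With a single aspect ratio and one scale per octave, each location on
    # each of the five FPN levels gets exactly one square anchor
    # (sizes 64-1024, strides 8-128).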

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9 # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
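    # Early vision-language fusion inside the head: "MHA-B" selects
    # bi-directional multi-head attention between visual and token features,
    # supervised with a dot-product token loss; the CLAMP_* switches presumably
    # bound activations to avoid fp16 under/overflow when AMP is enabled.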
    
    FUSE_CONFIG:
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"   # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Datasets used for training the grounding model
DATASETS:
  REGISTER:
    bing_caption_train:
      yaml_path: "GCC/CC3M/yamls"
      yaml_name: "tiny.noun.harsh"
      yaml_name_no_coco: "tiny.noun.harsh"

  # PREDOWNLOAD_BING : True
  # PREDOWNLOAD_WITH_AZCOPY : True

  CAPTION_CONF: 0.4
  CAPTION_AUGMENTATION_VERSION: "v3.v1"
  CAPTION_VOCAB_FILE: "tools/files/mixed_vocab.v1.tmp0.davincci.chunk1of1.filtered.json"
  DESCRIPTION_FILE: "tools/files/o365.description.v1.json"

  TRAIN:  ("mixed_train_no_coco", "flickr30k_train", "object365_dt_train", "bing_caption_train_no_coco")
  #  TRAIN: ("bing_caption_train", "mixed_train", "flickr30k_train", "coco_grounding_train", )
  TEST: ("coco_2017_val", )
  BING_INDEX_LIST: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
  # BING_INDEX_LIST: [ 0, 1, ]
  ONE_HOT: False
  FLICKR_COPY: 2
  MIXED_COPY: 2
  OBJECT365_COPY: 2
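  # The *_COPY factors likely oversample (duplicate) the corresponding dataset
  # that many times when building the training mix.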
  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
  FURTHER_SCREEN: True
  
  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1

  SEPARATION_TOKENS: ". "

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"


INPUT:
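  # Per-channel normalization statistics (these match the standard ImageNet
  # BGR mean/std used by Detectron-style codebases); images are resized to a
  # short side of 800 with the long side capped at 1333.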
  PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
  PIXEL_STD: [ 57.375, 57.120, 58.395 ]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

AUGMENT:
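  # Multi-scale training: the short side is presumably sampled from this list,
  # overriding the single MIN_SIZE_TRAIN above.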
  MULT_MIN_SIZE_TRAIN: (480, 560, 640, 720, 800)

DATALOADER:
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False

SOLVER:
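  # AdamW with a 10x lower learning rate for the language backbone. STEPS below
  # 1.0 are presumably fractions of MAX_ITER, i.e. the LR drops at roughly 67%
  # and 89% of training. Mixed precision (AMP), weight EMA (decay 0.999), and
  # full-model gradient clipping at 1.0 are enabled.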
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  STEPS: (0.67, 0.89)
  #MAX_EPOCH: 12
  MAX_ITER: 235026
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  USE_AMP: True
  MODEL_EMA: 0.999
  FIND_UNUSED_PARAMETERS: False

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0