# Captured from a file viewer: file size 3,608 bytes, id 749745d (presumably the revision hash).
# The viewer's line-number gutter (1-157) was removed so the document parses as YAML.
# Model definition: meta-architecture, vision and language backbones,
# RPN anchor layout, and the DYHEAD head with its fusion options.
#
# NOTE(review): parenthesized values such as (64, 128, ...) are plain YAML
# strings; presumably the config loader evaluates them into tuples, so do not
# rewrite them as YAML lists without confirming against the loader.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # Relative path to a .pth file; presumably the initial checkpoint --
  # TODO confirm how the path is resolved.
  WEIGHT: "MODEL/swin_tiny_patch4_window7_224.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  BACKBONE:
    FUSION_VERSION: "v2"
    CONV_BODY: "SWINT-FPN-RETINANET"
    # Same value (256) as DYHEAD.CHANNELS below.
    OUT_CHANNELS: 256

  SWINT:
    VERSION: "fusion"

  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-tiny"
    MASK_SPECIAL: False
    TOKENIZER_TYPE: "roberta-base"
    USE_CHECKPOINT: False

  RPN:
    USE_FPN: True
    # Five anchor sizes and five strides are listed (presumably one per
    # feature level -- confirm), with a single aspect ratio.
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: False
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: False
      # Head-level fusion is switched off here: no early fusion, TYPE "NONE".
      EARLY_FUSE_ON: False
      TYPE: "NONE"  # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      # Of the *_LOSS toggles, only USE_DOT_PRODUCT_TOKEN_LOSS is enabled.
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      # Every CLAMP_* guard is enabled.
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Dataset selection and data-construction options.
# After TRAIN/TEST the keys are kept in alphabetical order; preserve that when
# adding new entries.
DATASETS:
  # Trains on flickr30k_train and evaluates on coco_2017_val. The values are
  # single-element tuple strings (note the trailing comma).
  TRAIN: ("flickr30k_train", )
  TEST: ("coco_2017_val", )
  ADD_DET_PROMPT: False
  ADD_DET_PROMPT_ADVANCED: False
  ALTERNATIVE_TRAINING: False
  BOX_THRESHOLD: 0.1
  CAPTION_CONF: 0.9
  CAPTION_FORMAT_VERSION: "v2"
  CAPTION_MIN_BOX: 1
  CAPTION_NMS: 0.9
  CLASS_AGNOSTIC: False
  CLASS_CONCAT: False
  COCO_COPY: 1
  # Alternative value left commented out; the active setting zeroes every
  # entry except the third.
  # CONTROL_PROB: (0.05, 0.05, 0.5, 0.2)
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  DISABLE_CLIP_TO_IMAGE: False
  DISABLE_SHUFFLE: False
  FEW_SHOT: 0
  FLICKR_COPY: 1
  FLICKR_GT_TYPE: "separate"
  FULL_QUESTION_PROB: 0.5
  FURTHER_SCREEN: False
  # -1 appears on several keys in this section (GENERAL_COPY*, MAX_BOX,
  # PROMPT_LIMIT_NEG); presumably a "disabled / no limit" sentinel -- confirm.
  GENERAL_COPY: -1
  GENERAL_COPY_TEST: -1
  INFERENCE_CAPTION: False
  IN_COPY: 1
  LOCAL_DEBUG: False
  LVIS_COPY: 1
  LVIS_USE_NORMAL_AP: False
  MAX_BOX: -1
  MIXED_COPY: 1
  MULTISTAGE_TRAINING: False
  NEG_QUESTION_PROB: 0.8
  NO_MINUS_ONE_FOR_ONE_HOT: False
  OBJECT365_COPY: 1
  OI_COPY: 1
  ONE_HOT: False
  PACK_RANDOM_CAPTION_NUMBER: 0
  POS_QUESTION_PROB: 0.6
  PREDOWNLOAD_BING: False
  PREDOWNLOAD_WITH_AZCOPY: False
  PROMPT_LIMIT_NEG: -1
  RANDOM_SAMPLE_NEG: 85

  REPLACE_CLEAN_LABEL: False
  SAFEGUARD_POSITIVE_CAPTION: True
  # The value is a period followed by a space; the trailing space is part of
  # the string, so keep the quotes.
  SEPARATION_TOKENS: ". "
  SHUFFLE_SEED: 0
  TEST_DATASETNAME_SUFFIX: ""
  TRAIN_DATASETNAME_SUFFIX: ""
  USE_CAPTION_PROMPT: False
  USE_COCO_FORMAT: False
  USE_CROWD: False
  USE_OD_AUG: False
  USE_OVERRIDE_CATEGORY: False
  # NOTE(review): "SUPRESS" is the key's spelling here; renaming it would
  # change which option is set, so leave it unless the consumer is updated.
  USE_SUPRESS_QUERY: False
  VG_COPY: 1

# Input normalization statistics and resize bounds.
# Train and test share the same limits (min 800, max 1333).
INPUT:
  # NOTE(review): these look like the usual ImageNet statistics scaled to
  # 0-255 and listed in B, G, R order -- confirm the channel order against
  # the image loader.
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

# Augmentation options.
AUGMENT:
  # Five candidate minimum sizes; the largest (800) equals
  # INPUT.MIN_SIZE_TRAIN. Presumably one is picked per sample for multi-scale
  # training -- confirm. Kept as a tuple string on purpose.
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

# Data loading options.
DATALOADER:
  # Presumably batched images are padded to a multiple of this value --
  # confirm against the collate code.
  SIZE_DIVISIBILITY: 32
  DISTRIBUTE_CHUNK_AMONG_NODE: False

# Optimization settings: optimizer and learning rates, schedule, warmup,
# AMP/EMA toggles, checkpoint cadence, and gradient clipping.
SOLVER:
  OPTIMIZER: "ADAMW"
  BASE_LR: 0.0001
  # One tenth of BASE_LR; presumably applied to the language backbone --
  # confirm. Keep the plain decimal form: YAML 1.1 loaders such as PyYAML read
  # dot-less exponent notation (e.g. 1e-5) as a string, not a float.
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Fractions rather than iteration counts; presumably interpreted relative
  # to MAX_ITER -- confirm against the scheduler.
  STEPS: (0.67, 0.89)
  MAX_ITER: 800000
  # Same value as TEST.IMS_PER_BATCH (64).
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: True
  USE_AMP: True
  MODEL_EMA: 0.999
  CHECKPOINT_PERIOD: 2500

  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

# Evaluation settings.
TEST:
  DURING_TRAINING: False
  # Same value as SOLVER.IMS_PER_BATCH (64).
  IMS_PER_BATCH: 64