File size: 2,640 Bytes
749745d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Model definition: overall architecture, vision and language backbones,
# anchor generation, and the detection head.
# NOTE(review): the schema that consumes these keys is not visible in this
# file; comments describe what is set here and hedge anything inferred.
# Parenthesised values such as (2, 2, 18, 2) are plain YAML strings —
# presumably the loader evaluates them into tuples, so keep their spelling.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # Initial weights file. The name suggests a Swin-Base checkpoint
  # (patch 4, window 12, 384 px, 22k) — TODO confirm how the path is resolved.
  WEIGHT: "swin_base_patch4_window12_384_22k.pth"
  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"
  # Proposal/detection limits: 3000 candidates before NMS, 100 kept per image,
  # and a score threshold of 0.0 at inference.
  ATSS:
    PRE_NMS_TOP_N: 3000
    DETECTIONS_PER_IMG: 100
    INFERENCE_TH: 0.0

  # Swin backbone hyper-parameters. DEPTHS, NUM_HEADS and OUT_CHANNELS each
  # hold four entries; OUT_CHANNELS starts at EMBED_DIM (128) and doubles.
  # WINDOW_SIZE 12 agrees with the "window12" in the WEIGHT file name.
  SWINT:
    VERSION: "fusion"
    EMBED_DIM: 128
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (4, 8, 16, 32)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (128, 256, 512, 1024)
    DROP_PATH_RATE: 0.4

  # Backbone wrapper settings; OUT_CHANNELS (256) equals DYHEAD.CHANNELS below.
  BACKBONE:
    FUSION_VERSION: "v3"
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256
    USE_CHECKPOINT: True
    # NOTE(review): -1 presumably means no backbone stage is frozen — confirm.
    FREEZE_CONV_BODY_AT: -1

  # Text encoder: not frozen (FREEZE: False), 768-dimensional language features.
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "roberta-fused-v2"
    TOKENIZER_TYPE: "roberta-base"
    LANG_DIM: 768
    MASK_SPECIAL: False
    USE_CHECKPOINT: False

  # Anchors: five sizes listed alongside five strides, a single aspect ratio
  # of 1.0, and one scale per octave.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  # Detection head: 256 channels, 6 conv layers, with the GN / DyReLU /
  # DFConv / DyFuse switches all enabled.
  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 6
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0
    USE_CHECKPOINT: True

    # Vision-language fusion and loss switches for the head. With
    # EARLY_FUSE_ON False and TYPE "NONE", head-level fusion looks disabled
    # here — presumably fusion happens in the backbones (see SWINT.VERSION
    # "fusion" and the "roberta-fused-v2" model type); TODO confirm.
    FUSE_CONFIG:
      EARLY_FUSE_ON: False
      TYPE: "NONE"   # "MHA-B", "MHA-S", "FILM", "SCAN", "NONE"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      # Of the loss switches in this section, only the dot-product token
      # loss is enabled.
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      # All clamping switches are on; going by their names they guard against
      # numeric underflow/overflow — confirm exact bounds in the consumer.
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Used for the grounding model
# Dataset splits: one training split and one evaluation split.
# The values are tuple-style strings with a trailing comma (single element);
# presumably the loader evaluates them into tuples — keep the spelling as-is.
DATASETS:
  TRAIN: ("refcoco_train", )
  TEST: ("refcoco_val", )
  # NOTE(review): confirm what this flag stops shuffling (sample order or
  # something else) — the consuming code is not visible here.
  DISABLE_SHUFFLE: True

# Image preprocessing: per-channel normalisation constants and size limits.
INPUT:
  # Three per-channel means, then three per-channel standard deviations.
  # NOTE(review): channel order (RGB vs. BGR) is not stated here — confirm.
  PIXEL_MEAN:
    - 103.530
    - 116.280
    - 123.675
  PIXEL_STD:
    - 57.375
    - 57.120
    - 58.395
  # Training and testing use the same minimum and maximum sizes.
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

# Training-time augmentation settings.
AUGMENT:
  # Five candidate minimum sizes; the largest (800) equals INPUT.MIN_SIZE_TRAIN.
  # Tuple-style string — presumably evaluated by the loader; keep the spelling.
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
  # Flip probability is zero, which the original author flags as important,
  # especially for RefCOCO. NOTE(review): presumably because referring
  # expressions can mention left/right — confirm.
  FLIP_PROB_TRAIN: 0.0  # important, especially for RefCOCO

# Data loading settings.
DATALOADER:
  # NOTE(review): presumably batched images are padded so height and width
  # are multiples of 32 — confirm against the collate code.
  SIZE_DIVISIBILITY: 32

# Optimisation schedule and training-loop switches.
SOLVER:
  OPTIMIZER: ADAMW
  # BASE_LR and LANG_LR are set to the same value (1e-5).
  BASE_LR: 0.00001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.0001
  # Fractional values — presumably LR-decay milestones expressed as fractions
  # of the full schedule rather than iteration counts; TODO confirm.
  # Tuple-style string: keep the spelling for the loader.
  STEPS: (0.67, 0.89)
  MAX_EPOCH: 20
  # Same batch size as TEST.IMS_PER_BATCH (16).
  IMS_PER_BATCH: 16
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001
  TEST_WITH_INFERENCE: True
  FIND_UNUSED_PARAMETERS: False
  USE_AMP: True
  # NOTE(review): presumably the decay rate of an exponential moving average
  # of the model weights — confirm.
  MODEL_EMA: 0.999

  # Gradient clipping is switched off (ENABLED: False); the remaining keys
  # only matter if it is turned on — presumably, verify in the trainer.
  CLIP_GRADIENTS:
    ENABLED: False
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0

# Evaluation settings.
TEST:
  # NOTE(review): presumably enables evaluation runs during training — confirm.
  DURING_TRAINING: True
  EVAL_TASK: "grounding"
  # Same batch size as SOLVER.IMS_PER_BATCH (16).
  IMS_PER_BATCH: 16