fffiloni committed on
Commit
f88d9af
1 Parent(s): 148bb44

Create configs/xdecoder/svlp_focalt_lang.yaml

Browse files
configs/xdecoder/svlp_focalt_lang.yaml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # --------------------------------------------------------
2
+ # X-Decoder -- Generalized Decoding for Pixel, Image, and Language
3
+ # Copyright (c) 2022 Microsoft
4
+ # Licensed under The MIT License [see LICENSE for details]
5
+ # Written by Xueyan Zou (xueyan@cs.wisc.edu)
6
+ # --------------------------------------------------------
7
+
8
+ ##################
9
+ # Task settings
10
+ ##################
11
+ VERBOSE: true
12
+ MODEL:
13
+ NAME: xdecoder_model
14
+ HEAD: xdecoder_head
15
+ DIM_PROJ: 512
16
+ BACKBONE_DIM: 768
17
+ TEXT:
18
+ ARCH: vlpencoder
19
+ NAME: transformer
20
+ TOKENIZER: clip
21
+ CONTEXT_LENGTH: 77 # 77
22
+ WIDTH: 512
23
+ HEADS: 8
24
+ LAYERS: 12 # 6
25
+ AUTOGRESSIVE: True
26
+ BACKBONE:
27
+ NAME: focal_dw
28
+ PRETRAINED: ''
29
+ LOAD_PRETRAINED: false
30
+ FOCAL:
31
+ PRETRAIN_IMG_SIZE: 224
32
+ PATCH_SIZE: 4
33
+ EMBED_DIM: 96
34
+ DEPTHS: [2, 2, 6, 2]
35
+ FOCAL_LEVELS: [3, 3, 3, 3]
36
+ FOCAL_WINDOWS: [3, 3, 3, 3]
37
+ DROP_PATH_RATE: 0.3
38
+ MLP_RATIO: 4.0
39
+ DROP_RATE: 0.0
40
+ PATCH_NORM: True
41
+ USE_CONV_EMBED: True
42
+ SCALING_MODULATOR: True
43
+ USE_CHECKPOINT: False
44
+ USE_POSTLN: true
45
+ USE_POSTLN_IN_MODULATION: false
46
+ USE_LAYERSCALE: True
47
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
48
+ OUT_INDICES: [0, 1, 2, 3]
49
+ ENCODER:
50
+ NAME: transformer_encoder_fpn
51
+ IGNORE_VALUE: 255
52
+ NUM_CLASSES: 133
53
+ LOSS_WEIGHT: 1.0
54
+ CONVS_DIM: 512
55
+ MASK_DIM: 512
56
+ NORM: "GN"
57
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
58
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
59
+ COMMON_STRIDE: 4
60
+ TRANSFORMER_ENC_LAYERS: 6
61
+ DECODER:
62
+ NAME: xdecoder
63
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
64
+ MASK: True
65
+ GROUNDING:
66
+ ENABLED: True
67
+ MAX_LEN: 5
68
+ TEXT_WEIGHT: 2.0
69
+ CLASS_WEIGHT: 0.5
70
+ DETECTION: False
71
+ CAPTION:
72
+ ENABLED: True
73
+ PHRASE_PROB: 0.0
74
+ SIM_THRES: 0.95
75
+ CAPTIONING:
76
+ ENABLED: True
77
+ STEP: 50
78
+ RETRIEVAL:
79
+ ENABLED: True
80
+ DIM_IMG: 768
81
+ ENSEMBLE: True
82
+ HIDDEN_DIM: 512
83
+ NUM_OBJECT_QUERIES: 101
84
+ NHEADS: 8
85
+ DROPOUT: 0.0
86
+ DIM_FEEDFORWARD: 2048
87
+ PRE_NORM: False
88
+ ENFORCE_INPUT_PROJ: False
89
+ SIZE_DIVISIBILITY: 32
90
+ TRAIN_NUM_POINTS: 12544
91
+ OVERSAMPLE_RATIO: 3.0
92
+ IMPORTANCE_SAMPLE_RATIO: 0.75
93
+ DEC_LAYERS: 10 # 9 decoder layers, plus one for the loss on the learnable queries
94
+ TOP_GROUNDING_LAYERS: 3
95
+ TOP_CAPTION_LAYERS: 3
96
+ TOP_CAPTIONING_LAYERS: 3
97
+ TOP_RETRIEVAL_LAYERS: 3
98
+ TOP_OPENIMAGE_LAYERS: 10
99
+ TEST:
100
+ SEMANTIC_ON: True
101
+ INSTANCE_ON: True
102
+ PANOPTIC_ON: True
103
+ OVERLAP_THRESHOLD: 0.8
104
+ OBJECT_MASK_THRESHOLD: 0.4
105
+ SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
106
+ DETECTIONS_PER_IMAGE: 100
107
+
108
+ INPUT:
109
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
110
+ PIXEL_STD: [58.395, 57.120, 57.375]