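# CAT-Seg open-vocabulary semantic segmentation config (Detectron2 yacs-style YAML).
# Swin-L (patch4, window12, 384, ImageNet-22k) image backbone paired with a CLIP
# ViT-L/14@336px encoder; trained on COCO-Stuff (171 classes).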
MODEL:
  META_ARCHITECTURE: "CATSeg"
  BACKBONE:
    FREEZE_AT: 0
    NAME: "D2SwinTransformer"
  SWIN:
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [4, 8, 16, 32]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
    OUT_FEATURES: ["res2", "res3", "res4"]
  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
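  # Open-vocabulary head (CAT-Seg-style cost aggregation between CLIP text embeddings
  # and image features). FEATURE_RESOLUTION [24, 24] matches the CLIP ViT-L/14@336px
  # patch grid (336 / 14 = 24). PROMPT_DEPTH/PROMPT_LENGTH of 0 leave learned prompt
  # tokens disabled, and CLIP_FINETUNE limits CLIP updates to its attention layers.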
  SEM_SEG_HEAD:
    NAME: "OpenVocabHead"
    IN_FEATURES: ["res2", "res3", "res4"]
    IGNORE_VALUE: 255
    NUM_CLASSES: 171
    TRAIN_CLASS_JSON: "datasets/coco.json"
    TEST_CLASS_JSON: "datasets/coco.json"
    CLIP_PRETRAINED: "ViT-L/14@336px"
    PROMPT_DEPTH: 0
    PROMPT_LENGTH: 0
    TEXT_AFFINITY_DIM: 768
    TEXT_AFFINITY_PROJ_DIM: 128
    APPEARANCE_AFFINITY_DIM: 768
    APPEARANCE_AFFINITY_PROJ_DIM: 128
    DECODER_DIMS: [64, 32]
    DECODER_AFFINITY_DIMS: [256, 128]
    DECODER_AFFINITY_PROJ_DIMS: [32, 16]
    NUM_LAYERS: 4
    NUM_HEADS: 4
    HIDDEN_DIMS: 128
    POOLING_SIZES: [6, 6]
    FEATURE_RESOLUTION: [24, 24]
    WINDOW_SIZES: 12
    ATTENTION_TYPE: "linear"
    CLIP_FINETUNE: "attention"
  PROMPT_ENSEMBLE_TYPE: "imagenet"
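# Both splits are COCO-Stuff semantic segmentation; for cross-dataset (open-vocabulary)
# evaluation, DATASETS.TEST and SEM_SEG_HEAD.TEST_CLASS_JSON would typically be pointed
# at the target dataset at test time.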
DATASETS:
  TRAIN: ("coco_2017_train_stuff_all_sem_seg",)
  TEST: ("coco_2017_test_stuff_all_sem_seg",)
SOLVER:
  IMS_PER_BATCH: 4
  BASE_LR: 0.0002
  MAX_ITER: 80000
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 0
  WEIGHT_DECAY: 0.0001
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  BACKBONE_MULTIPLIER: 0.01
  CLIP_MULTIPLIER: 0.01
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
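# Training samples 384x384 absolute crops (matching SWIN.PRETRAIN_IMG_SIZE and
# SIZE_DIVISIBILITY) with SSD-style color augmentation; at test time the short side is
# resized to 640 with the long side capped at 2560.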
INPUT:
  MIN_SIZE_TRAIN: (384, )
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  MAX_SIZE_TEST: 2560
  CROP:
    ENABLED: True
    TYPE: "absolute"
    SIZE: (384, 384)
    SINGLE_CATEGORY_MAX_AREA: 1.0
  COLOR_AUG_SSD: True
  SIZE_DIVISIBILITY: 384
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_semantic"
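# Evaluate every 5,000 iterations with whole-image (non-sliding-window) inference.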
TEST:
  EVAL_PERIOD: 5000
  SLIDING_WINDOW: False
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 8
VERSION: 2
CUDNN_BENCHMARK: True
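# Typical Detectron2-style launch; the entry-point script and config path below are
# illustrative assumptions, not taken from this file. With IMS_PER_BATCH: 4 and 4 GPUs,
# each GPU sees one image per iteration.
#   python train_net.py --num-gpus 4 \
#     --config-file configs/coco_stuff_swin_large.yaml \
#     OUTPUT_DIR output/catseg_swin_l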