# SAM-CAT-Seg — configs/config.yaml
# Author: Seokju Cho
# Provenance: initial commit (f8f62f3)
# Detectron2/yacs config for CAT-Seg-style open-vocabulary semantic segmentation.
# NOTE(review): indentation was lost in this file (all keys were flat at column 0,
# producing duplicate top-level keys such as NAME and NUM_HEADS). Nesting below is
# reconstructed from standard Detectron2 config structure — confirm the placement of
# CLIP_FINETUNE and PROMPT_ENSEMBLE_TYPE against the project's add_*_config defaults.
MODEL:
  META_ARCHITECTURE: "CATSeg"
  BACKBONE:
    FREEZE_AT: 0
    NAME: "D2SwinTransformer"
  SWIN:
    # Swin-Large settings; window 12 / pretrain size 384 match the 22k weights file below.
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [4, 8, 16, 32]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
    OUT_FEATURES: ["res2", "res3", "res4"]
  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
  # ImageNet mean/std in 0-255 RGB scale.
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  SEM_SEG_HEAD:
    NAME: "OpenVocabHead"
    IN_FEATURES: ["res2", "res3", "res4"]
    IGNORE_VALUE: 255
    # 171 = COCO-Stuff thing+stuff classes (matches the coco.json class lists).
    NUM_CLASSES: 171
    TRAIN_CLASS_JSON: "datasets/coco.json"
    TEST_CLASS_JSON: "datasets/coco.json"
    CLIP_PRETRAINED: "ViT-L/14@336px"
    # 0/0 disables learned prompt tuning.
    PROMPT_DEPTH: 0
    PROMPT_LENGTH: 0
    TEXT_AFFINITY_DIM: 768
    TEXT_AFFINITY_PROJ_DIM: 128
    APPEARANCE_AFFINITY_DIM: 768
    APPEARANCE_AFFINITY_PROJ_DIM: 128
    DECODER_DIMS: [64, 32]
    DECODER_AFFINITY_DIMS: [256, 128]
    DECODER_AFFINITY_PROJ_DIMS: [32, 16]
    NUM_LAYERS: 4
    NUM_HEADS: 4
    HIDDEN_DIMS: 128
    POOLING_SIZES: [6, 6]
    FEATURE_RESOLUTION: [24, 24]
    WINDOW_SIZES: 12
    ATTENTION_TYPE: "linear"
    # Fine-tune only the attention layers of CLIP.
    CLIP_FINETUNE: "attention"
  PROMPT_ENSEMBLE_TYPE: "imagenet"
DATASETS:
  # Python-tuple strings are parsed by yacs via literal_eval — keep the parentheses.
  TRAIN: ("coco_2017_train_stuff_all_sem_seg",)
  TEST: ("coco_2017_test_stuff_all_sem_seg",)
SOLVER:
  IMS_PER_BATCH: 4
  BASE_LR: 0.0002
  MAX_ITER: 80000
  # No warmup: factor 1.0 with 0 warmup iters.
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 0
  WEIGHT_DECAY: 0.0001
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  # Reduced LR multipliers for the pretrained backbone and CLIP parameters.
  BACKBONE_MULTIPLIER: 0.01
  CLIP_MULTIPLIER: 0.01
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
INPUT:
  MIN_SIZE_TRAIN: (384, )
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  MAX_SIZE_TEST: 2560
  CROP:
    ENABLED: True
    TYPE: "absolute"
    # Crop size matches the Swin pretrain resolution and SIZE_DIVISIBILITY.
    SIZE: (384, 384)
    SINGLE_CATEGORY_MAX_AREA: 1.0
  COLOR_AUG_SSD: True
  SIZE_DIVISIBILITY: 384
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_semantic"
TEST:
  EVAL_PERIOD: 5000
  SLIDING_WINDOW: False
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 8
VERSION: 2
CUDNN_BENCHMARK: True