---
# Detectron2 (yacs) training config for CATSeg-style open-vocabulary semantic
# segmentation: Swin-L backbone + frozen-ish CLIP ViT-L/14@336px text/image
# encoders, trained on COCO-Stuff (171 classes).
#
# NOTE: values such as True/False and tuple-literals like ("name",) follow the
# Detectron2 config convention (yacs literal_eval's tuple strings) and are kept
# exactly as the consuming code expects them.
MODEL:
  META_ARCHITECTURE: "CATSeg"
  BACKBONE:
    FREEZE_AT: 0  # 0 = train all backbone stages (no freezing)
    NAME: "D2SwinTransformer"
  # Swin-Large, window 12, pretrained at 384x384 on ImageNet-22k.
  SWIN:
    EMBED_DIM: 192
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [4, 8, 16, 32]
    WINDOW_SIZE: 12
    APE: False  # no absolute position embedding
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
    OUT_FEATURES: ["res2", "res3", "res4"]
  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
  # ImageNet mean/std in 0-255 RGB order (FORMAT: "RGB" below).
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  SEM_SEG_HEAD:
    NAME: "OpenVocabHead"
    IN_FEATURES: ["res2", "res3", "res4"]
    IGNORE_VALUE: 255  # label value excluded from the loss
    NUM_CLASSES: 171   # COCO-Stuff (things + stuff)
    # Class-name vocabularies for training and evaluation.
    TRAIN_CLASS_JSON: "datasets/coco.json"
    TEST_CLASS_JSON: "datasets/coco.json"
    CLIP_PRETRAINED: "ViT-L/14@336px"
    # 0/0 disables learnable prompt tuning (plain text templates are used).
    PROMPT_DEPTH: 0
    PROMPT_LENGTH: 0
    # Affinity (guidance) feature dims and their projection dims.
    TEXT_AFFINITY_DIM: 768
    TEXT_AFFINITY_PROJ_DIM: 128
    APPEARANCE_AFFINITY_DIM: 768
    APPEARANCE_AFFINITY_PROJ_DIM: 128
    DECODER_DIMS: [64, 32]
    DECODER_AFFINITY_DIMS: [256, 128]
    DECODER_AFFINITY_PROJ_DIMS: [32, 16]
    # Cost-aggregation transformer settings.
    NUM_LAYERS: 4
    NUM_HEADS: 4
    HIDDEN_DIMS: 128
    POOLING_SIZES: [6, 6]
    FEATURE_RESOLUTION: [24, 24]  # 384 input / patch 16 = 24x24 tokens
    WINDOW_SIZES: 12
    ATTENTION_TYPE: "linear"
  # NOTE(review): flattened source did not preserve nesting depth here —
  # these two may belong under SEM_SEG_HEAD instead of MODEL; confirm
  # against the project's add_*_config() defaults.
  CLIP_FINETUNE: "attention"  # finetune only CLIP attention weights
  PROMPT_ENSEMBLE_TYPE: "imagenet"  # ImageNet prompt-template ensemble
DATASETS:
  TRAIN: ("coco_2017_train_stuff_all_sem_seg",)
  TEST: ("coco_2017_test_stuff_all_sem_seg",)
SOLVER:
  IMS_PER_BATCH: 4
  BASE_LR: 0.0002
  MAX_ITER: 80000
  WARMUP_FACTOR: 1.0
  WARMUP_ITERS: 0
  WEIGHT_DECAY: 0.0001
  OPTIMIZER: "ADAMW"
  LR_SCHEDULER_NAME: "WarmupCosineLR"
  # Reduced LR multipliers keep the pretrained backbone/CLIP near-frozen.
  BACKBONE_MULTIPLIER: 0.01
  CLIP_MULTIPLIER: 0.01
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0
INPUT:
  MIN_SIZE_TRAIN: (384, )
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  MAX_SIZE_TEST: 2560
  CROP:
    ENABLED: True
    TYPE: "absolute"
    SIZE: (384, 384)
    SINGLE_CATEGORY_MAX_AREA: 1.0
  COLOR_AUG_SSD: True
  SIZE_DIVISIBILITY: 384  # pad images to multiples of the crop size
  FORMAT: "RGB"
  DATASET_MAPPER_NAME: "mask_former_semantic"
TEST:
  EVAL_PERIOD: 5000  # run evaluation every 5k iterations
  SLIDING_WINDOW: False
DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 8
VERSION: 2  # Detectron2 config version
CUDNN_BENCHMARK: True