# Configuration for the "OVSegDEMO" meta-architecture.
# Components named below: a "D2SwinTransformer" backbone, an
# "OpenVocabMaskFormerHead" segmentation head, and a "ViT-L/14" CLIP adapter.
# Train split: coco_2017_train_stuff_sem_seg; test split: ade20k_sem_seg_val.
#
# NOTE(review): the nesting was restored from a whitespace-collapsed copy of
# this file, following the usual detectron2 / MaskFormer key layout. Confirm
# the parent of each key against the consuming config schema.

MODEL:
  META_ARCHITECTURE: "OVSegDEMO"
  BACKBONE:
    FREEZE_AT: 0
    NAME: "D2SwinTransformer"
  SWIN:
    EMBED_DIM: 128
    DEPTHS: [2, 2, 18, 2]
    NUM_HEADS: [4, 8, 16, 32]
    WINDOW_SIZE: 12
    APE: False
    DROP_PATH_RATE: 0.3
    PATCH_NORM: True
    PRETRAIN_IMG_SIZE: 384
  # Checkpoint path is relative to the working directory of the process
  # that loads this config.
  WEIGHTS: "./ovseg_swinbase_vitL14_ft_mpt.pth"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  SEM_SEG_HEAD:
    NAME: "OpenVocabMaskFormerHead"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    IGNORE_VALUE: 255
    NUM_CLASSES: 171  # number of categories in training set
    EMBEDDING_DIM: 768
    EMBED_LAYERS: 2
    COMMON_STRIDE: 4  # not used, hard-coded
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
  MASK_FORMER:
    TRANSFORMER_IN_FEATURE: "res5"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    DICE_WEIGHT: 1.0
    MASK_WEIGHT: 20.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 100
    NHEADS: 8
    DROPOUT: 0.1
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    DEC_LAYERS: 6
    PRE_NORM: False
  CLIP_ADAPTER:
    TEXT_TEMPLATES: "vild"
    CLIP_MODEL_NAME: "ViT-L/14"
    MASK_FILL: "mean"
    MASK_EXPAND_RATIO: 1.0
    MASK_THR: 0.35  # choose the foreground objects
    MASK_MATTING: False  # use soft background, default not used
    MASK_PROMPT_DEPTH: 3
    MASK_PROMPT_FWD: True  # use mask prompt during forward
    REGION_RESIZED: True  # resize to the input of clip, e.g., 224
    CLIP_ENSEMBLE: True  # use ensemble of two classification branches
    CLIP_ENSEMBLE_WEIGHT: 0.0

DATASETS:
  # Python-tuple syntax kept as plain scalars: a YAML parser yields strings
  # here, so this presumably relies on the config loader to decode them into
  # tuples — TODO confirm.
  TRAIN: ("coco_2017_train_stuff_sem_seg",)
  TEST: ("ade20k_sem_seg_val",)

SOLVER:
  IMS_PER_BATCH: 32
  BASE_LR: 0.00006
  MAX_ITER: 120000
  # Written with an explicit mantissa dot (was `1e-6`) so that YAML 1.1
  # parsers such as PyYAML resolve it as a float instead of a string.
  WARMUP_FACTOR: 1.0e-6
  WARMUP_ITERS: 1500
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_NORM: 0.0
  WEIGHT_DECAY_EMBED: 0.0
  BACKBONE_MULTIPLIER: 1.0
  TEST_IMS_PER_BATCH: 1
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 0.01
    NORM_TYPE: 2.0

INPUT:
  # Literal expansion of `[int(x * 0.1 * 640) for x in range(5, 21)]`,
  # i.e. 0.5x to 2.0x of 640 in steps of 64. This was previously computed at
  # load time through a `!!python/object/apply:eval` tag, which executes code
  # and only works with an unsafe YAML loader.
  MIN_SIZE_TRAIN:
    [320, 384, 448, 512, 576, 640, 704, 768,
     832, 896, 960, 1024, 1088, 1152, 1216, 1280]
  MIN_SIZE_TRAIN_SAMPLING: "choice"
  MIN_SIZE_TEST: 640
  MAX_SIZE_TRAIN: 2560
  MAX_SIZE_TEST: 2560
  CROP:
    ENABLED: True
    TYPE: "absolute"
    # Tuple syntax as a plain scalar (loaded as a string by YAML itself);
    # presumably decoded by the config loader — TODO confirm.
    SIZE: (640, 640)
    SINGLE_CATEGORY_MAX_AREA: 1.0
  COLOR_AUG_SSD: True
  SIZE_DIVISIBILITY: 640  # used in dataset mapper
  FORMAT: "RGB"

TEST:
  EVAL_PERIOD: 5000
  # Test-time augmentation settings; disabled via ENABLED below.
  AUG:
    ENABLED: False
    MIN_SIZES: [256, 384, 512, 640, 768, 896]
    MAX_SIZE: 3584
    FLIP: True

DATALOADER:
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 4

VERSION: 2