# Model configuration: "STAnything" meta-architecture with a ResNet-50
# backbone, a MaskDINO segmentation head / pixel decoder, and a MaskDINO
# transformer decoder.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML (and the first inline "#" commented out everything after it).
# The nesting below was reconstructed from the key grouping; confirm it
# against the config schema of the consuming code, in particular the
# placement of MASK_ON, VISUAL_PROMPT, TEXT and LANGUAGE_BACKBONE.
MODEL:
  META_ARCHITECTURE: "STAnything"
  MASK_ON: True
  VISUAL_PROMPT: True

  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"

  WEIGHTS: "weights/bert_r50_coco.pth"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used

  SEM_SEG_HEAD:
    NAME: "MaskDINOHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MaskDINOEncoder"
    DIM_FEEDFORWARD: 1024
    # Both level counts equal the number of entries (3) in
    # DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES below.
    NUM_FEATURE_LEVELS: 3
    TOTAL_NUM_FEATURE_LEVELS: 3
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  MaskDINO:
    TRANSFORMER_DECODER_NAME: "MaskDINODecoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 300
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    # 9+1, 9 decoder layers, add one for the loss on learnable query
    DEC_LAYERS: 9
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    INITIAL_PRED: True
    TWO_STAGE: True
    DN: "standard"
    DN_NUM: 100
    # Keep the quotes: a bare `no` is read as boolean false by YAML 1.1
    # parsers.
    INITIALIZE_BOX_TYPE: "no"
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25

  # NOTE(review): TEXT and LANGUAGE_BACKBONE are assumed to be direct children
  # of MODEL; the flattened original does not show their parent - confirm.
  TEXT:
    ARCH: "clip_teacher"

  LANGUAGE_BACKBONE:
    LANG_DIM: 512