---
# Model configuration for the "STAnything" meta-architecture: a depth-50
# ResNet backbone, a MaskDINO segmentation head / pixel decoder, and a
# MaskDINO transformer decoder, with instance-level inference enabled.
#
# NOTE(review): nesting was reconstructed from key order and the public
# detectron2 / MaskDINO config layout -- confirm against the consuming
# config schema before relying on it.
MODEL:
  META_ARCHITECTURE: "STAnything"
  MASK_ON: True
  VISUAL_PROMPT: True

  BACKBONE:
    FREEZE_AT: 0
    NAME: "build_resnet_backbone"
  # Checkpoint path; relative, so presumably resolved from the working
  # directory -- verify against the loader.
  WEIGHTS: "weights/bert_r50_coco.pth"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]

  RESNETS:
    DEPTH: 50
    STEM_TYPE: "basic"  # not used
    STEM_OUT_CHANNELS: 64
    STRIDE_IN_1X1: False
    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
    # NORM: "SyncBN"
    RES5_MULTI_GRID: [1, 1, 1]  # not used

  SEM_SEG_HEAD:
    NAME: "MaskDINOHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 80
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MaskDINOEncoder"
    # Distinct from MaskDINO.DIM_FEEDFORWARD below (different mapping).
    DIM_FEEDFORWARD: 1024
    NUM_FEATURE_LEVELS: 3
    TOTAL_NUM_FEATURE_LEVELS: 3
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  MaskDINO:
    TRANSFORMER_DECODER_NAME: "MaskDINODecoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 4.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    BOX_WEIGHT: 5.0
    GIOU_WEIGHT: 2.0
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 300
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    DEC_LAYERS: 9  # 9+1, 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    INITIAL_PRED: True
    TWO_STAGE: True
    DN: "standard"
    DN_NUM: 100
    # Quoted on purpose: a bare `no` would be read as boolean false by
    # YAML 1.1 loaders.
    INITIALIZE_BOX_TYPE: "no"
    TEST:
      SEMANTIC_ON: False
      INSTANCE_ON: True
      PANOPTIC_ON: False
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.25

  # NOTE(review): the original nesting level of TEXT and LANGUAGE_BACKBONE
  # could not be determined; they are placed directly under MODEL as
  # siblings -- confirm against the code that reads this config.
  TEXT:
    ARCH: "clip_teacher"
  LANGUAGE_BACKBONE:
    LANG_DIM: 512