# OneFormer model settings for the Cityscapes unified-segmentation setup.
# Keys in this file extend the base config named by _BASE_; how the base path
# is resolved is up to the config loader.
_BASE_: Base-Cityscapes-UnifiedSegmentation.yaml

MODEL:
  META_ARCHITECTURE: "OneFormer"

  # Segmentation head and its pixel decoder.
  SEM_SEG_HEAD:
    NAME: "OneFormerHead"
    IGNORE_VALUE: 255
    NUM_CLASSES: 19
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 256
    MASK_DIM: 256
    NORM: "GN"
    # pixel decoder
    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  # Transformer decoder, loss weights, and point-sampling settings.
  ONE_FORMER:
    TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    DEEP_SUPERVISION: True
    NO_OBJECT_WEIGHT: 0.1
    CLASS_WEIGHT: 2.0
    MASK_WEIGHT: 5.0
    DICE_WEIGHT: 5.0
    CONTRASTIVE_WEIGHT: 0.5
    CONTRASTIVE_TEMPERATURE: 0.07
    HIDDEN_DIM: 256
    NUM_OBJECT_QUERIES: 150
    USE_TASK_NORM: True
    NHEADS: 8
    DROPOUT: 0.1
    DIM_FEEDFORWARD: 2048
    ENC_LAYERS: 0
    PRE_NORM: False
    ENFORCE_INPUT_PROJ: False
    SIZE_DIVISIBILITY: 32
    CLASS_DEC_LAYERS: 2
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75

  # Text encoder settings.
  TEXT_ENCODER:
    WIDTH: 256
    CONTEXT_LENGTH: 77
    NUM_LAYERS: 6
    VOCAB_SIZE: 49408
    PROJ_NUM_LAYERS: 2
    N_CTX: 16

  # Model-level inference settings (MODEL.TEST). This is a different section
  # from the top-level TEST mapping at the end of the file.
  TEST:
    SEMANTIC_ON: True
    INSTANCE_ON: True
    PANOPTIC_ON: True
    OVERLAP_THRESHOLD: 0.8
    OBJECT_MASK_THRESHOLD: 0.8
    TASK: "panoptic"

# Top-level evaluation settings, separate from MODEL.TEST above.
TEST:
  DETECTIONS_PER_IMAGE: 150