Spaces:
Runtime error
Runtime error
# ------------------------------------------------------------------------ | |
# Semantic SAM | |
# Copyright (c) Microsoft, Inc. and its affiliates. | |
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li. | |
# ------------------------------------------------------------------------ | |
################## | |
# Task settings | |
################## | |
WEIGHT: '' | |
PORT: 53711 | |
VERBOSE: true | |
OUTPUT_DIR: '../../data/output/test' | |
# misc | |
LOADER: | |
JOINT: True | |
KEY_DATASET: 'coco' | |
# model | |
MODEL: | |
NAME: interactive_mask_dino | |
HEAD: general_head | |
MASK_ON: false | |
KEYPOINT_ON: false | |
LOAD_PROPOSALS: false | |
DIM_PROJ: 512 | |
BACKBONE_DIM: 768 | |
BACKGROUND: False | |
WEIGHTS: '' | |
TEXT: | |
ARCH: noencoder # no language encoder when training on SA-1B data only | |
NAME: transformer | |
TOKENIZER: clip | |
CONTEXT_LENGTH: 18 # 77 | |
WIDTH: 512 | |
HEADS: 8 | |
LAYERS: 12 # 6 | |
AUTOGRESSIVE: True | |
BACKBONE: | |
NAME: swin | |
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' | |
LOAD_PRETRAINED: true | |
SWIN: | |
PRETRAIN_IMG_SIZE: 384 | |
PATCH_SIZE: 4 | |
EMBED_DIM: 192 | |
DEPTHS: [ 2, 2, 18, 2 ] | |
NUM_HEADS: [ 6, 12, 24, 48 ] | |
WINDOW_SIZE: 12 | |
MLP_RATIO: 4.0 | |
QKV_BIAS: true | |
QK_SCALE: ~ | |
DROP_RATE: 0.0 | |
ATTN_DROP_RATE: 0.0 | |
DROP_PATH_RATE: 0.3 | |
APE: false | |
PATCH_NORM: true | |
USE_CHECKPOINT: false | |
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ] | |
ENCODER: | |
NAME: encoder_deform | |
IGNORE_VALUE: 255 | |
NUM_CLASSES: 1 | |
LOSS_WEIGHT: 1.0 | |
CONVS_DIM: 256 | |
MASK_DIM: 256 | |
NORM: "GN" | |
IN_FEATURES: [ "res2", "res3", "res4", "res5" ] | |
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ] | |
COMMON_STRIDE: 4 | |
TRANSFORMER_ENC_LAYERS: 6 | |
TOTAL_NUM_FEATURE_LEVELS: 4 | |
NUM_FEATURE_LEVELS: 3 | |
FEATURE_ORDER: "low2high" | |
DECODER: | |
NAME: interactive_mask_dino | |
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" | |
MASK: True | |
BOX: True | |
PART: True | |
GROUNDING: | |
ENABLED: False | |
MAX_LEN: 5 | |
TEXT_WEIGHT: 2.0 | |
CLASS_WEIGHT: 0.5 | |
CAPTION: | |
ENABLED: False | |
PHRASE_PROB: 0.0 | |
SIM_THRES: 0.95 | |
CAPTIONING: | |
ENABLED: False | |
STEP: 50 | |
RETRIEVAL: | |
ENABLED: False | |
DIM_IMG: 768 | |
ENSEMBLE: True | |
OPENIMAGE: | |
ENABLED: False | |
NEGATIVE_SAMPLES: 5 | |
GROUNDING: | |
ENABLED: False | |
MAX_LEN: 5 | |
DEEP_SUPERVISION: True | |
NO_OBJECT_WEIGHT: 0.1 | |
CLASS_WEIGHT: 4.0 | |
MASK_WEIGHT: 5.0 | |
DICE_WEIGHT: 5.0 | |
BOX_WEIGHT: 5.0 | |
GIOU_WEIGHT: 2.0 | |
IOU_WEIGHT: 1.0 | |
COST_CLASS_WEIGHT: 4.0 | |
COST_DICE_WEIGHT: 5.0 | |
COST_MASK_WEIGHT: 5.0 | |
COST_BOX_WEIGHT: 5.0 | |
COST_GIOU_WEIGHT: 2.0 | |
HIDDEN_DIM: 256 | |
NUM_OBJECT_QUERIES: 0 | |
NHEADS: 8 | |
DROPOUT: 0.0 | |
DIM_FEEDFORWARD: 2048 | |
ENC_LAYERS: 0 | |
PRE_NORM: False | |
ENFORCE_INPUT_PROJ: False | |
SIZE_DIVISIBILITY: 32 | |
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query | |
TRAIN_NUM_POINTS: 12544 | |
OVERSAMPLE_RATIO: 3.0 | |
IMPORTANCE_SAMPLE_RATIO: 0.75 | |
TWO_STAGE: False | |
INITIALIZE_BOX_TYPE: 'no' | |
DN: seg | |
DN_NOISE_SCALE: 0.4 | |
DN_NUM: 100 | |
INITIAL_PRED: False | |
LEARN_TGT: False | |
TOTAL_NUM_FEATURE_LEVELS: 4 | |
SEMANTIC_CE_LOSS: False | |
PANO_BOX_LOSS: False | |
COCO: False | |
O365: False | |
SAM: True | |
PASCAL: False | |
RE_POINT: True | |
NUM_INTERACTIVE_TOKENS: 6 | |
MAX_NUM_INSTANCE: 60 | |
TEST: | |
SEMANTIC_ON: True | |
INSTANCE_ON: True | |
PANOPTIC_ON: True | |
BOX_INTERACTIVE: False | |
CLASSIFICATION_ON: False | |
OVERLAP_THRESHOLD: 0.8 | |
OBJECT_MASK_THRESHOLD: 0.25 | |
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false | |
TEST_FOUCUS_ON_BOX: False | |
PANO_TRANSFORM_EVAL: True | |
PANO_TEMPERATURE: 0.06 | |
TEST: | |
EVAL_PERIOD: 500000 | |
PRECISE_BN: | |
NUM_ITER: 1 | |
ENABLED: False | |
AUG: | |
ENABLED: False | |
SAM: | |
INPUT: | |
MIN_SIZE_TEST: 800 | |
MAX_SIZE_TEST: 1333 | |
IMAGE_SIZE: 1024 | |
MIN_SCALE: 0.99 | |
MAX_SCALE: 1.01 | |
DATASET_MAPPER_NAME: "sam" | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: False | |
SIZE_DIVISIBILITY: 32 | |
RANDOM_FLIP: "horizontal" | |
MASK_FORMAT: "polygon" | |
FORMAT: "RGB" | |
CROP: | |
ENABLED: True | |
DATASET: | |
DATASET: 'sam' | |
TEST: | |
DETECTIONS_PER_IMAGE: 100 | |
NAME: coco_eval | |
IOU_TYPE: ['bbox', 'segm'] | |
USE_MULTISCALE: false | |
BATCH_SIZE_TOTAL: 8 | |
MODEL_FILE: '' | |
AUG: | |
ENABLED: False | |
TRAIN: | |
BATCH_SIZE_TOTAL: 1 | |
BATCH_SIZE_PER_GPU: 1 | |
SHUFFLE: true | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 4 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
COCO: | |
INPUT: | |
MIN_SIZE_TEST: 800 | |
MAX_SIZE_TEST: 1333 | |
IMAGE_SIZE: 1024 | |
MIN_SCALE: 0.1 | |
MAX_SCALE: 2.0 | |
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj" | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: False | |
SIZE_DIVISIBILITY: 32 | |
RANDOM_FLIP: "horizontal" | |
MASK_FORMAT: "polygon" | |
FORMAT: "RGB" | |
CROP: | |
ENABLED: True | |
DATASET: | |
DATASET: 'coco' | |
TEST: | |
DETECTIONS_PER_IMAGE: 100 | |
NAME: coco_eval | |
IOU_TYPE: ['bbox', 'segm'] | |
USE_MULTISCALE: false | |
BATCH_SIZE_TOTAL: 1 | |
MODEL_FILE: '' | |
AUG: | |
ENABLED: False | |
TRAIN: | |
BATCH_SIZE_TOTAL: 1 | |
BATCH_SIZE_PER_GPU: 1 | |
SHUFFLE: true | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 2 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
VLP: | |
INPUT: | |
IMAGE_SIZE: 224 | |
DATASET_MAPPER_NAME: "vlpretrain" | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: False | |
SIZE_DIVISIBILITY: 32 | |
MASK_FORMAT: "polygon" | |
FORMAT: "RGB" | |
CROP: | |
ENABLED: True | |
TRAIN: | |
BATCH_SIZE_TOTAL: 2 | |
BATCH_SIZE_PER_GPU: 2 | |
TEST: | |
BATCH_SIZE_TOTAL: 256 | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 16 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
INPUT: | |
PIXEL_MEAN: [123.675, 116.280, 103.530] | |
PIXEL_STD: [58.395, 57.120, 57.375] | |
DATASETS: | |
TRAIN: ["sam_train"] | |
# interactive segmentation evaluation. | |
TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"] | |
# TEST: ["sam_minival"] | |
CLASS_CONCAT: false | |
SIZE_DIVISIBILITY: 32 | |
PROPOSAL_FILES_TRAIN: [] | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 16 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
# Detectron2 training config for optimizer and lr scheduler | |
SOLVER: | |
BASE_LR_END: 0.0 | |
MOMENTUM: 0.9 | |
NESTEROV: False | |
CHECKPOINT_PERIOD: 5000 | |
IMS_PER_BATCH: 1 | |
REFERENCE_WORLD_SIZE: 0 | |
BIAS_LR_FACTOR: 1.0 | |
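# NOTE(review): YAML reads a plain `None` as the string "None", not as null (QK_SCALE uses `~` for null) - confirm the consumer expects this. | |
WEIGHT_DECAY_BIAS: None | |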
# original | |
BASE_LR: 0.0001 | |
STEPS: [327778, 355092] | |
MAX_ITER: 368750 | |
GAMMA: 0.1 | |
WARMUP_FACTOR: 1.0 | |
WARMUP_ITERS: 10 | |
WARMUP_METHOD: "linear" | |
WEIGHT_DECAY: 0.05 | |
OPTIMIZER: "ADAMW" | |
LR_SCHEDULER_NAME: "WarmupMultiStepLR" | |
LR_MULTIPLIER: | |
backbone: 0.1 | |
lang_encoder: 0.1 | |
WEIGHT_DECAY_NORM: 0.0 | |
WEIGHT_DECAY_EMBED: 0.0 | |
CLIP_GRADIENTS: | |
ENABLED: True | |
CLIP_TYPE: "full_model" | |
CLIP_VALUE: 0.01 | |
NORM_TYPE: 2.0 | |
AMP: | |
ENABLED: True | |
# Evaluation Dataset | |
ADE20K: | |
INPUT: | |
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280] | |
MIN_SIZE_TRAIN_SAMPLING: "choice" | |
MIN_SIZE_TEST: 640 | |
MAX_SIZE_TRAIN: 2560 | |
MAX_SIZE_TEST: 2560 | |
MASK_FORMAT: "polygon" | |
CROP: | |
ENABLED: True | |
TYPE: "absolute" | |
SIZE: [640, 640] | |
SINGLE_CATEGORY_MAX_AREA: 1.0 | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: True | |
SIZE_DIVISIBILITY: 640 # used in dataset mapper | |
DATASET_MAPPER_NAME: "mask_former_panoptic" | |
FORMAT: "RGB" | |
DATASET: | |
DATASET: 'ade' | |
TRAIN: | |
ASPECT_RATIO_GROUPING: true | |
BATCH_SIZE_TOTAL: 16 | |
BATCH_SIZE_PER_GPU: 2 | |
SHUFFLE: true | |
TEST: | |
DETECTIONS_PER_IMAGE: 100 | |
NAME: coco_eval | |
IOU_TYPE: ['bbox', 'segm'] | |
USE_MULTISCALE: false | |
BATCH_SIZE_TOTAL: 8 | |
MODEL_FILE: '' | |
AUG: | |
ENABLED: False | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 8 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
#ADE20K: | |
# INPUT: | |
# MIN_SIZE_TRAIN: 640 | |
# MIN_SIZE_TRAIN_SAMPLING: "choice" | |
# MIN_SIZE_TEST: 640 | |
# MAX_SIZE_TRAIN: 2560 | |
# MAX_SIZE_TEST: 2560 | |
# MASK_FORMAT: "polygon" | |
# CROP: | |
# ENABLED: True | |
# TYPE: "absolute" | |
# SIZE: (640, 640) | |
# SINGLE_CATEGORY_MAX_AREA: 1.0 | |
# COLOR_AUG_SSD: True | |
# SIZE_DIVISIBILITY: 640 # used in dataset mapper | |
# DATASET_MAPPER_NAME: "mask_former_panoptic" | |
# FORMAT: "RGB" | |
# DATASET: | |
# DATASET: 'ade' | |
# TEST: | |
# BATCH_SIZE_TOTAL: 8 | |
REF: | |
INPUT: | |
PIXEL_MEAN: [123.675, 116.280, 103.530] | |
PIXEL_STD: [58.395, 57.120, 57.375] | |
MIN_SIZE_TEST: 512 | |
MAX_SIZE_TEST: 1024 | |
FORMAT: "RGB" | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 0 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: False | |
TEST: | |
BATCH_SIZE_TOTAL: 8 | |
SUN: | |
INPUT: | |
PIXEL_MEAN: [123.675, 116.280, 103.530] | |
PIXEL_STD: [58.395, 57.120, 57.375] | |
MIN_SIZE_TEST: 512 | |
MAX_SIZE_TEST: 1024 | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 0 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: False | |
TEST: | |
BATCH_SIZE_TOTAL: 8 | |
SCAN: | |
INPUT: | |
PIXEL_MEAN: [123.675, 116.280, 103.530] | |
PIXEL_STD: [58.395, 57.120, 57.375] | |
MIN_SIZE_TEST: 512 | |
MAX_SIZE_TEST: 1024 | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 0 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: False | |
TEST: | |
BATCH_SIZE_TOTAL: 8 | |
BDD: | |
INPUT: | |
PIXEL_MEAN: [123.675, 116.280, 103.530] | |
PIXEL_STD: [58.395, 57.120, 57.375] | |
MIN_SIZE_TEST: 800 | |
MAX_SIZE_TEST: 1333 | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 0 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: False | |
TEST: | |
BATCH_SIZE_TOTAL: 8 | |
CITY: | |
INPUT: | |
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ] | |
MIN_SIZE_TRAIN_SAMPLING: "choice" | |
MIN_SIZE_TEST: 1024 | |
MAX_SIZE_TRAIN: 4096 | |
MAX_SIZE_TEST: 2048 | |
CROP: | |
ENABLED: True | |
TYPE: "absolute" | |
SIZE: [ 512, 1024 ] | |
SINGLE_CATEGORY_MAX_AREA: 1.0 | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: True | |
SIZE_DIVISIBILITY: -1 | |
FORMAT: "RGB" | |
DATASET_MAPPER_NAME: "mask_former_panoptic" | |
MASK_FORMAT: "polygon" | |
TEST: | |
EVAL_PERIOD: 5000 | |
BATCH_SIZE_TOTAL: 1 | |
AUG: | |
ENABLED: False | |
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ] | |
MAX_SIZE: 4096 | |
FLIP: True | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: True | |
NUM_WORKERS: 2 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |
TRAIN: | |
ASPECT_RATIO_GROUPING: true | |
BATCH_SIZE_TOTAL: 2 | |
BATCH_SIZE_PER_GPU: 2 | |
SHUFFLE: true | |
PSACAL_PART: | |
INPUT: | |
MIN_SIZE_TEST: 800 | |
MAX_SIZE_TEST: 1333 | |
IMAGE_SIZE: 1024 | |
MIN_SCALE: 0.1 | |
MAX_SCALE: 2.0 | |
DATASET_MAPPER_NAME: "pascal_part_lsj" | |
IGNORE_VALUE: 255 | |
COLOR_AUG_SSD: False | |
SIZE_DIVISIBILITY: 32 | |
RANDOM_FLIP: "horizontal" | |
MASK_FORMAT: "polygon" | |
FORMAT: "RGB" | |
CROP: | |
ENABLED: True | |
MODEL: | |
MASK_ON: True | |
KEYPOINT_ON: False | |
LOAD_PROPOSALS: False | |
# DATASET: | |
# DATASET: 'coco' | |
TEST: | |
DETECTIONS_PER_IMAGE: 100 | |
NAME: coco_eval | |
IOU_TYPE: ['bbox', 'segm'] | |
USE_MULTISCALE: false | |
BATCH_SIZE_TOTAL: 8 | |
MODEL_FILE: '' | |
AUG: | |
ENABLED: False | |
TRAIN: | |
BATCH_SIZE_TOTAL: 1 | |
BATCH_SIZE_PER_GPU: 1 | |
SHUFFLE: true | |
DATALOADER: | |
FILTER_EMPTY_ANNOTATIONS: False | |
NUM_WORKERS: 2 | |
LOAD_PROPOSALS: False | |
SAMPLER_TRAIN: "TrainingSampler" | |
ASPECT_RATIO_GROUPING: True | |