---
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
##################
# Task settings
##################
# Top-level verbosity switch (boolean). What it controls is decided by the
# consuming code, which is not visible from this file.
VERBOSE: true
# Model definition, split into four sub-sections:
#   TEXT     - language encoder
#   BACKBONE - image backbone (FOCAL holds the backbone hyper-parameters)
#   ENCODER  - pixel/feature encoder on top of the backbone features
#   DECODER  - decoder, its per-task switches, and inference (TEST) options
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been lost. The repeated NAME / ENABLED keys force the
# sub-mapping boundaries, but confirm against the loader that TEST lives under
# DECODER and that HIDDEN_DIM ... TOP_OPENIMAGE_LAYERS are DECODER-level keys.
MODEL:
  NAME: xdecoder_model
  HEAD: xdecoder_head
  # Same value as TEXT.WIDTH and DECODER.HIDDEN_DIM below.
  DIM_PROJ: 512
  # Same value as DECODER.RETRIEVAL.DIM_IMG below.
  BACKBONE_DIM: 768

  TEXT:
    ARCH: vlpencoder
    NAME: transformer
    TOKENIZER: clip
    CONTEXT_LENGTH: 77  # 77
    WIDTH: 512
    HEADS: 8
    LAYERS: 12  # 6
    # Key spelling is kept exactly as in the original file; presumably the
    # consumer looks it up under this name, so do not "correct" it here.
    AUTOGRESSIVE: true

  BACKBONE:
    NAME: focal_dw
    # Empty path with LOAD_PRETRAINED disabled: no backbone checkpoint is
    # referenced by this config.
    PRETRAINED: ''
    LOAD_PRETRAINED: false
    FOCAL:
      PRETRAIN_IMG_SIZE: 224
      PATCH_SIZE: 4
      EMBED_DIM: 96
      # The three per-stage lists each have four entries, matching the four
      # entries of OUT_FEATURES / OUT_INDICES.
      DEPTHS: [2, 2, 6, 2]
      FOCAL_LEVELS: [3, 3, 3, 3]
      FOCAL_WINDOWS: [3, 3, 3, 3]
      DROP_PATH_RATE: 0.3
      MLP_RATIO: 4.0
      DROP_RATE: 0.0
      PATCH_NORM: true
      USE_CONV_EMBED: true
      SCALING_MODULATOR: true
      USE_CHECKPOINT: false
      USE_POSTLN: true
      USE_POSTLN_IN_MODULATION: false
      USE_LAYERSCALE: true
      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
      OUT_INDICES: [0, 1, 2, 3]

  ENCODER:
    NAME: transformer_encoder_fpn
    IGNORE_VALUE: 255
    NUM_CLASSES: 133
    LOSS_WEIGHT: 1.0
    CONVS_DIM: 512
    MASK_DIM: 512
    NORM: "GN"
    # Same feature names as BACKBONE.FOCAL.OUT_FEATURES.
    IN_FEATURES: ["res2", "res3", "res4", "res5"]
    # Subset of IN_FEATURES (omits "res2").
    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
    COMMON_STRIDE: 4
    TRANSFORMER_ENC_LAYERS: 6

  DECODER:
    NAME: xdecoder
    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
    MASK: true

    # Per-task switches and their task-specific options.
    GROUNDING:
      ENABLED: true
      MAX_LEN: 5
      TEXT_WEIGHT: 2.0
      CLASS_WEIGHT: 0.5
    DETECTION: false
    CAPTION:
      ENABLED: true
      PHRASE_PROB: 0.0
      SIM_THRES: 0.95
    CAPTIONING:
      ENABLED: true
      STEP: 50
    RETRIEVAL:
      ENABLED: true
      DIM_IMG: 768
      ENSEMBLE: true

    HIDDEN_DIM: 512
    NUM_OBJECT_QUERIES: 101
    NHEADS: 8
    DROPOUT: 0.0
    DIM_FEEDFORWARD: 2048
    PRE_NORM: false
    ENFORCE_INPUT_PROJ: false
    SIZE_DIVISIBILITY: 32
    TRAIN_NUM_POINTS: 12544
    OVERSAMPLE_RATIO: 3.0
    IMPORTANCE_SAMPLE_RATIO: 0.75
    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
    TOP_GROUNDING_LAYERS: 3
    TOP_CAPTION_LAYERS: 3
    TOP_CAPTIONING_LAYERS: 3
    TOP_RETRIEVAL_LAYERS: 3
    # Equal to DEC_LAYERS, unlike the other TOP_*_LAYERS values (3).
    TOP_OPENIMAGE_LAYERS: 10

    # Inference-time options.
    TEST:
      SEMANTIC_ON: true
      INSTANCE_ON: true
      PANOPTIC_ON: true
      OVERLAP_THRESHOLD: 0.8
      OBJECT_MASK_THRESHOLD: 0.4
      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
      DETECTIONS_PER_IMAGE: 100
# Per-channel input normalisation constants, expressed on a 0-255 scale.
# The values equal 255 x (0.485, 0.456, 0.406) and 255 x (0.229, 0.224, 0.225).
# NOTE(review): channel order is presumably RGB -- confirm against the loader.
INPUT:
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]