Ahsen Khaliq committed on
Commit
16aee22
·
1 Parent(s): 92b1fb6
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. cog.yaml +28 -0
  2. configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml +61 -0
  3. configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
  4. configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml +18 -0
  5. configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml +61 -0
  6. configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
  7. configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml +18 -0
  8. configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml +61 -0
  9. configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
  10. configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml +44 -0
  11. configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml +37 -0
  12. configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml +37 -0
  13. configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml +37 -0
  14. configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml +15 -0
  15. configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml +15 -0
  16. configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml +61 -0
  17. configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
  18. configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
  19. configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
  20. configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
  21. configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
  22. configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
  23. configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml +61 -0
  24. configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
  25. configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
  26. configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
  27. configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
  28. configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
  29. configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
  30. configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml +61 -0
  31. configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml +11 -0
  32. configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml +44 -0
  33. configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml +16 -0
  34. configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml +18 -0
  35. configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml +15 -0
  36. configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml +15 -0
  37. configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml +47 -0
  38. configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml +11 -0
  39. configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml +44 -0
  40. configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml +16 -0
  41. configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml +16 -0
  42. configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +21 -0
  43. configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml +15 -0
  44. configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml +15 -0
  45. configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml +47 -0
  46. configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml +11 -0
  47. configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml +45 -0
  48. configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml +16 -0
  49. configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml +16 -0
  50. configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml +21 -0
cog.yaml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ build:
2
+ gpu: true
3
+ cuda: "10.1"
4
+ python_version: "3.8"
5
+ system_packages:
6
+ - "libgl1-mesa-glx"
7
+ - "libglib2.0-0"
8
+ python_packages:
9
+ - "ipython==7.30.1"
10
+ - "numpy==1.21.4"
11
+ - "torch==1.8.1"
12
+ - "torchvision==0.9.1"
13
+ - "opencv-python==4.5.5.62"
14
+ - "Shapely==1.8.0"
15
+ - "h5py==3.6.0"
16
+ - "scipy==1.7.3"
17
+ - "submitit==1.4.1"
18
+ - "scikit-image==0.19.1"
19
+ - "Cython==0.29.27"
20
+ - "timm==0.4.12"
21
+ run:
22
+ - pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
23
+ - pip install git+https://github.com/cocodataset/panopticapi.git
24
+ - pip install git+https://github.com/mcordts/cityscapesScripts.git
25
+ - git clone https://github.com/facebookresearch/Mask2Former
26
+ - TORCH_CUDA_ARCH_LIST='7.5' FORCE_CUDA=1 python Mask2Former/mask2former/modeling/pixel_decoder/ops/setup.py build install
27
+
28
+ predict: "predict.py:Predictor"
configs/ade20k/instance-segmentation/Base-ADE20K-InstanceSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ # NORM: "SyncBN"
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("ade20k_instance_train",)
18
+ TEST: ("ade20k_instance_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 160000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 640
40
+ MAX_SIZE_TRAIN: 2560
41
+ MAX_SIZE_TEST: 2560
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (640, 640)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_instance"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
56
+ MAX_SIZE: 4480
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/ade20k/instance-segmentation/maskformer2_R50_bs16_160k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-ADE20K-InstanceSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 100
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: True
41
+ INSTANCE_ON: True
42
+ PANOPTIC_ON: True
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/ade20k/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
configs/ade20k/panoptic-segmentation/Base-ADE20K-PanopticSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ # NORM: "SyncBN"
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("ade20k_panoptic_train",)
18
+ TEST: ("ade20k_panoptic_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 160000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 640
40
+ MAX_SIZE_TRAIN: 2560
41
+ MAX_SIZE_TEST: 2560
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (640, 640)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_panoptic"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
56
+ MAX_SIZE: 4480
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/ade20k/panoptic-segmentation/maskformer2_R50_bs16_160k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-ADE20K-PanopticSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 150
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: True
41
+ INSTANCE_ON: True
42
+ PANOPTIC_ON: True
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/ade20k/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
configs/ade20k/semantic-segmentation/Base-ADE20K-SemanticSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ # NORM: "SyncBN"
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("ade20k_sem_seg_train",)
18
+ TEST: ("ade20k_sem_seg_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 160000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 512
40
+ MAX_SIZE_TRAIN: 2048
41
+ MAX_SIZE_TEST: 2048
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (512, 512)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: 512 # used in dataset mapper
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_semantic"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [256, 384, 512, 640, 768, 896]
56
+ MAX_SIZE: 3584
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/ade20k/semantic-segmentation/maskformer2_R101_bs16_90k.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/ade20k/semantic-segmentation/maskformer2_R50_bs16_160k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-ADE20K-SemanticSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 150
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: True
41
+ INSTANCE_ON: False
42
+ PANOPTIC_ON: False
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_384_bs16_160k_res640.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ INPUT:
18
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
19
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
20
+ MIN_SIZE_TEST: 640
21
+ MAX_SIZE_TRAIN: 2560
22
+ MAX_SIZE_TEST: 2560
23
+ CROP:
24
+ ENABLED: True
25
+ TYPE: "absolute"
26
+ SIZE: (640, 640)
27
+ SINGLE_CATEGORY_MAX_AREA: 1.0
28
+ COLOR_AUG_SSD: True
29
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
30
+ FORMAT: "RGB"
31
+ TEST:
32
+ EVAL_PERIOD: 5000
33
+ AUG:
34
+ ENABLED: False
35
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
36
+ MAX_SIZE: 4480
37
+ FLIP: True
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_160k_res640.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ INPUT:
18
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
19
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
20
+ MIN_SIZE_TEST: 640
21
+ MAX_SIZE_TRAIN: 2560
22
+ MAX_SIZE_TEST: 2560
23
+ CROP:
24
+ ENABLED: True
25
+ TYPE: "absolute"
26
+ SIZE: (640, 640)
27
+ SINGLE_CATEGORY_MAX_AREA: 1.0
28
+ COLOR_AUG_SSD: True
29
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
30
+ FORMAT: "RGB"
31
+ TEST:
32
+ EVAL_PERIOD: 5000
33
+ AUG:
34
+ ENABLED: False
35
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
36
+ MAX_SIZE: 4480
37
+ FLIP: True
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_160k_res640.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ INPUT:
18
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
19
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
20
+ MIN_SIZE_TEST: 640
21
+ MAX_SIZE_TRAIN: 2560
22
+ MAX_SIZE_TEST: 2560
23
+ CROP:
24
+ ENABLED: True
25
+ TYPE: "absolute"
26
+ SIZE: (640, 640)
27
+ SINGLE_CATEGORY_MAX_AREA: 1.0
28
+ COLOR_AUG_SSD: True
29
+ SIZE_DIVISIBILITY: 640 # used in dataset mapper
30
+ FORMAT: "RGB"
31
+ TEST:
32
+ EVAL_PERIOD: 5000
33
+ AUG:
34
+ ENABLED: False
35
+ MIN_SIZES: [320, 480, 640, 800, 960, 1120]
36
+ MAX_SIZE: 4480
37
+ FLIP: True
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_small_bs16_160k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_small_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/ade20k/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_160k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_160k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 6, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/instance-segmentation/Base-Cityscapes-InstanceSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ NORM: "SyncBN" # use syncbn for cityscapes dataset
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("cityscapes_fine_instance_seg_train",)
18
+ TEST: ("cityscapes_fine_instance_seg_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 90000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 1024
40
+ MAX_SIZE_TRAIN: 4096
41
+ MAX_SIZE_TEST: 2048
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (512, 1024)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: -1
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_instance"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
56
+ MAX_SIZE: 4096
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/cityscapes/instance-segmentation/maskformer2_R101_bs16_90k.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/cityscapes/instance-segmentation/maskformer2_R50_bs16_90k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-Cityscapes-InstanceSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 8
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: False
41
+ INSTANCE_ON: True
42
+ PANOPTIC_ON: False
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_small_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/instance-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 6, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/panoptic-segmentation/Base-Cityscapes-PanopticSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ NORM: "SyncBN" # use syncbn for cityscapes dataset
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("cityscapes_fine_panoptic_train",)
18
+ TEST: ("cityscapes_fine_panoptic_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 90000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 1024
40
+ MAX_SIZE_TRAIN: 4096
41
+ MAX_SIZE_TEST: 2048
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (512, 1024)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: -1
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_panoptic"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
56
+ MAX_SIZE: 4096
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/cityscapes/panoptic-segmentation/maskformer2_R101_bs16_90k.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/cityscapes/panoptic-segmentation/maskformer2_R50_bs16_90k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-Cityscapes-PanopticSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 19
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: True
41
+ INSTANCE_ON: True
42
+ PANOPTIC_ON: True
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_small_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/panoptic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 6, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/semantic-segmentation/Base-Cityscapes-SemanticSegmentation.yaml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ NORM: "SyncBN" # use SyncBN for the Cityscapes dataset
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("cityscapes_fine_sem_seg_train",)
18
+ TEST: ("cityscapes_fine_sem_seg_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ MAX_ITER: 90000
23
+ WARMUP_FACTOR: 1.0
24
+ WARMUP_ITERS: 0
25
+ WEIGHT_DECAY: 0.05
26
+ OPTIMIZER: "ADAMW"
27
+ LR_SCHEDULER_NAME: "WarmupPolyLR"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
38
+ MIN_SIZE_TRAIN_SAMPLING: "choice"
39
+ MIN_SIZE_TEST: 1024
40
+ MAX_SIZE_TRAIN: 4096
41
+ MAX_SIZE_TEST: 2048
42
+ CROP:
43
+ ENABLED: True
44
+ TYPE: "absolute"
45
+ SIZE: (512, 1024)
46
+ SINGLE_CATEGORY_MAX_AREA: 1.0
47
+ COLOR_AUG_SSD: True
48
+ SIZE_DIVISIBILITY: -1
49
+ FORMAT: "RGB"
50
+ DATASET_MAPPER_NAME: "mask_former_semantic"
51
+ TEST:
52
+ EVAL_PERIOD: 5000
53
+ AUG:
54
+ ENABLED: False
55
+ MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
56
+ MAX_SIZE: 4096
57
+ FLIP: True
58
+ DATALOADER:
59
+ FILTER_EMPTY_ANNOTATIONS: True
60
+ NUM_WORKERS: 4
61
+ VERSION: 2
configs/cityscapes/semantic-segmentation/maskformer2_R101_bs16_90k.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/cityscapes/semantic-segmentation/maskformer2_R50_bs16_90k.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-Cityscapes-SemanticSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 19
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, plus one for the loss on the learnable queries
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: True
41
+ INSTANCE_ON: False
42
+ PANOPTIC_ON: False
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 100
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_small_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_small_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/cityscapes/semantic-segmentation/swin/maskformer2_swin_tiny_bs16_90k.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_90k.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 6, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/instance-segmentation/Base-COCO-InstanceSegmentation.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ # NORM: "SyncBN"
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("coco_2017_train",)
18
+ TEST: ("coco_2017_val",)
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ STEPS: (327778, 355092)
23
+ MAX_ITER: 368750
24
+ WARMUP_FACTOR: 1.0
25
+ WARMUP_ITERS: 10
26
+ WEIGHT_DECAY: 0.05
27
+ OPTIMIZER: "ADAMW"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ IMAGE_SIZE: 1024
38
+ MIN_SCALE: 0.1
39
+ MAX_SCALE: 2.0
40
+ FORMAT: "RGB"
41
+ DATASET_MAPPER_NAME: "coco_instance_lsj"
42
+ TEST:
43
+ EVAL_PERIOD: 5000
44
+ DATALOADER:
45
+ FILTER_EMPTY_ANNOTATIONS: True
46
+ NUM_WORKERS: 4
47
+ VERSION: 2
configs/coco/instance-segmentation/maskformer2_R101_bs16_50ep.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ # NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/coco/instance-segmentation/maskformer2_R50_bs16_50ep.yaml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-COCO-InstanceSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IGNORE_VALUE: 255
7
+ NUM_CLASSES: 80
8
+ LOSS_WEIGHT: 1.0
9
+ CONVS_DIM: 256
10
+ MASK_DIM: 256
11
+ NORM: "GN"
12
+ # pixel decoder
13
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
14
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
15
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
16
+ COMMON_STRIDE: 4
17
+ TRANSFORMER_ENC_LAYERS: 6
18
+ MASK_FORMER:
19
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
20
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
21
+ DEEP_SUPERVISION: True
22
+ NO_OBJECT_WEIGHT: 0.1
23
+ CLASS_WEIGHT: 2.0
24
+ MASK_WEIGHT: 5.0
25
+ DICE_WEIGHT: 5.0
26
+ HIDDEN_DIM: 256
27
+ NUM_OBJECT_QUERIES: 100
28
+ NHEADS: 8
29
+ DROPOUT: 0.0
30
+ DIM_FEEDFORWARD: 2048
31
+ ENC_LAYERS: 0
32
+ PRE_NORM: False
33
+ ENFORCE_INPUT_PROJ: False
34
+ SIZE_DIVISIBILITY: 32
35
+ DEC_LAYERS: 10 # 9 decoder layers, plus one for the loss on the learnable queries
36
+ TRAIN_NUM_POINTS: 12544
37
+ OVERSAMPLE_RATIO: 3.0
38
+ IMPORTANCE_SAMPLE_RATIO: 0.75
39
+ TEST:
40
+ SEMANTIC_ON: False
41
+ INSTANCE_ON: True
42
+ PANOPTIC_ON: False
43
+ OVERLAP_THRESHOLD: 0.8
44
+ OBJECT_MASK_THRESHOLD: 0.8
configs/coco/instance-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/instance-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/instance-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
19
+ SOLVER:
20
+ STEPS: (655556, 710184)
21
+ MAX_ITER: 737500
configs/coco/instance-segmentation/swin/maskformer2_swin_small_bs16_50ep.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_small_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/instance-segmentation/swin/maskformer2_swin_tiny_bs16_50ep.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 96
7
+ DEPTHS: [2, 2, 6, 2]
8
+ NUM_HEADS: [3, 6, 12, 24]
9
+ WINDOW_SIZE: 7
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
14
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
15
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/panoptic-segmentation/Base-COCO-PanopticSegmentation.yaml ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ BACKBONE:
3
+ FREEZE_AT: 0
4
+ NAME: "build_resnet_backbone"
5
+ WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
6
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
7
+ PIXEL_STD: [58.395, 57.120, 57.375]
8
+ RESNETS:
9
+ DEPTH: 50
10
+ STEM_TYPE: "basic" # not used
11
+ STEM_OUT_CHANNELS: 64
12
+ STRIDE_IN_1X1: False
13
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
14
+ # NORM: "SyncBN"
15
+ RES5_MULTI_GRID: [1, 1, 1] # not used
16
+ DATASETS:
17
+ TRAIN: ("coco_2017_train_panoptic",)
18
+ TEST: ("coco_2017_val_panoptic_with_sem_seg",) # to evaluate instance and semantic performance as well
19
+ SOLVER:
20
+ IMS_PER_BATCH: 16
21
+ BASE_LR: 0.0001
22
+ STEPS: (327778, 355092)
23
+ MAX_ITER: 368750
24
+ WARMUP_FACTOR: 1.0
25
+ WARMUP_ITERS: 10
26
+ WEIGHT_DECAY: 0.05
27
+ OPTIMIZER: "ADAMW"
28
+ BACKBONE_MULTIPLIER: 0.1
29
+ CLIP_GRADIENTS:
30
+ ENABLED: True
31
+ CLIP_TYPE: "full_model"
32
+ CLIP_VALUE: 0.01
33
+ NORM_TYPE: 2.0
34
+ AMP:
35
+ ENABLED: True
36
+ INPUT:
37
+ IMAGE_SIZE: 1024
38
+ MIN_SCALE: 0.1
39
+ MAX_SCALE: 2.0
40
+ FORMAT: "RGB"
41
+ DATASET_MAPPER_NAME: "coco_panoptic_lsj"
42
+ TEST:
43
+ EVAL_PERIOD: 5000
44
+ DATALOADER:
45
+ FILTER_EMPTY_ANNOTATIONS: True
46
+ NUM_WORKERS: 4
47
+ VERSION: 2
configs/coco/panoptic-segmentation/maskformer2_R101_bs16_50ep.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ WEIGHTS: "R-101.pkl"
4
+ RESNETS:
5
+ DEPTH: 101
6
+ STEM_TYPE: "basic" # not used
7
+ STEM_OUT_CHANNELS: 64
8
+ STRIDE_IN_1X1: False
9
+ OUT_FEATURES: ["res2", "res3", "res4", "res5"]
10
+ # NORM: "SyncBN"
11
+ RES5_MULTI_GRID: [1, 1, 1] # not used
configs/coco/panoptic-segmentation/maskformer2_R50_bs16_50ep.yaml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: Base-COCO-PanopticSegmentation.yaml
2
+ MODEL:
3
+ META_ARCHITECTURE: "MaskFormer"
4
+ SEM_SEG_HEAD:
5
+ NAME: "MaskFormerHead"
6
+ IN_FEATURES: ["res2", "res3", "res4", "res5"]
7
+ IGNORE_VALUE: 255
8
+ NUM_CLASSES: 133
9
+ LOSS_WEIGHT: 1.0
10
+ CONVS_DIM: 256
11
+ MASK_DIM: 256
12
+ NORM: "GN"
13
+ # pixel decoder
14
+ PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
15
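+ IN_FEATURES: ["res2", "res3", "res4", "res5"] # NOTE(review): IN_FEATURES also appears earlier in this file with the same value; likely a duplicate key, confirm and keep only one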
16
+ DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
17
+ COMMON_STRIDE: 4
18
+ TRANSFORMER_ENC_LAYERS: 6
19
+ MASK_FORMER:
20
+ TRANSFORMER_DECODER_NAME: "MultiScaleMaskedTransformerDecoder"
21
+ TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
22
+ DEEP_SUPERVISION: True
23
+ NO_OBJECT_WEIGHT: 0.1
24
+ CLASS_WEIGHT: 2.0
25
+ MASK_WEIGHT: 5.0
26
+ DICE_WEIGHT: 5.0
27
+ HIDDEN_DIM: 256
28
+ NUM_OBJECT_QUERIES: 100
29
+ NHEADS: 8
30
+ DROPOUT: 0.0
31
+ DIM_FEEDFORWARD: 2048
32
+ ENC_LAYERS: 0
33
+ PRE_NORM: False
34
+ ENFORCE_INPUT_PROJ: False
35
+ SIZE_DIVISIBILITY: 32
36
+ DEC_LAYERS: 10 # 9 decoder layers, plus one for the loss on the learnable queries
37
+ TRAIN_NUM_POINTS: 12544
38
+ OVERSAMPLE_RATIO: 3.0
39
+ IMPORTANCE_SAMPLE_RATIO: 0.75
40
+ TEST:
41
+ SEMANTIC_ON: True
42
+ INSTANCE_ON: True
43
+ PANOPTIC_ON: True
44
+ OVERLAP_THRESHOLD: 0.8
45
+ OBJECT_MASK_THRESHOLD: 0.8
configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_384_bs16_50ep.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/panoptic-segmentation/swin/maskformer2_swin_base_IN21k_384_bs16_50ep.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 128
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [4, 8, 16, 32]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_base_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
configs/coco/panoptic-segmentation/swin/maskformer2_swin_large_IN21k_384_bs16_100ep.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _BASE_: ../maskformer2_R50_bs16_50ep.yaml
2
+ MODEL:
3
+ BACKBONE:
4
+ NAME: "D2SwinTransformer"
5
+ SWIN:
6
+ EMBED_DIM: 192
7
+ DEPTHS: [2, 2, 18, 2]
8
+ NUM_HEADS: [6, 12, 24, 48]
9
+ WINDOW_SIZE: 12
10
+ APE: False
11
+ DROP_PATH_RATE: 0.3
12
+ PATCH_NORM: True
13
+ PRETRAIN_IMG_SIZE: 384
14
+ WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
15
+ PIXEL_MEAN: [123.675, 116.280, 103.530]
16
+ PIXEL_STD: [58.395, 57.120, 57.375]
17
+ MASK_FORMER:
18
+ NUM_OBJECT_QUERIES: 200
19
+ SOLVER:
20
+ STEPS: (655556, 710184)
21
+ MAX_ITER: 737500