vidit98 committed on
Commit
2171e8f
0 Parent(s):

demo files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +34 -0
  2. .gitignore +45 -0
  3. Dockerfile +68 -0
  4. README.md +10 -0
  5. annotator/OneFormer/__init__.py +61 -0
  6. annotator/OneFormer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml +68 -0
  7. annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml +38 -0
  8. annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml +38 -0
  9. annotator/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml +42 -0
  10. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml +42 -0
  11. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml +42 -0
  12. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml +42 -0
  13. annotator/OneFormer/configs/ade20k/oneformer_R50_bs16_160k.yaml +58 -0
  14. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml +40 -0
  15. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml +40 -0
  16. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml +40 -0
  17. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml +15 -0
  18. annotator/OneFormer/configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml +68 -0
  19. annotator/OneFormer/configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml +18 -0
  20. annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml +18 -0
  21. annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml +18 -0
  22. annotator/OneFormer/configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml +22 -0
  23. annotator/OneFormer/configs/cityscapes/oneformer_R50_bs16_90k.yaml +59 -0
  24. annotator/OneFormer/configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml +20 -0
  25. annotator/OneFormer/configs/coco/Base-COCO-UnifiedSegmentation.yaml +54 -0
  26. annotator/OneFormer/configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml +24 -0
  27. annotator/OneFormer/configs/coco/oneformer_R50_bs16_50ep.yaml +59 -0
  28. annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml +28 -0
  29. annotator/OneFormer/configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml +15 -0
  30. annotator/OneFormer/configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml +68 -0
  31. annotator/OneFormer/configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml +22 -0
  32. annotator/OneFormer/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml +18 -0
  33. annotator/OneFormer/configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml +22 -0
  34. annotator/OneFormer/configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml +59 -0
  35. annotator/OneFormer/configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml +20 -0
  36. annotator/OneFormer/datasets/README.md +168 -0
  37. annotator/OneFormer/datasets/ade20k_instance_catid_mapping.txt +104 -0
  38. annotator/OneFormer/datasets/custom_datasets/README.md +35 -0
  39. annotator/OneFormer/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py +235 -0
  40. annotator/OneFormer/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py +245 -0
  41. annotator/OneFormer/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py +238 -0
  42. annotator/OneFormer/datasets/fg_ids.py +108 -0
  43. annotator/OneFormer/datasets/panoptic2detection_coco_format.py +152 -0
  44. annotator/OneFormer/datasets/prepare_ade20k_ins_seg.py +112 -0
  45. annotator/OneFormer/datasets/prepare_ade20k_pan_seg.py +500 -0
  46. annotator/OneFormer/datasets/prepare_ade20k_sem_seg.py +27 -0
  47. annotator/OneFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py +84 -0
  48. annotator/OneFormer/demo/defaults.py +82 -0
  49. annotator/OneFormer/oneformer/__init__.py +18 -0
  50. annotator/OneFormer/oneformer/config.py +210 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,45 @@
+ *_video
+ *_video.py
+ extras/
+
+ # output dir
+ .DS_Store
+ output
+
+ *.json
+ *.diff
+ *.jpg
+ *.ckpt
+
+ # compilation and distribution
+ __pycache__
+ _ext
+ *.pyc
+ *.pyd
+ *.so
+ *.dll
+ *.egg-info/
+ build/
+ dist/
+ wheels/
+
+ # pytorch/python/numpy formats
+ *.pth
+ *.pkl
+ *.npy
+ *.ts
+ model_ts*.txt
+
+ # ipython/jupyter notebooks
+ **/.ipynb_checkpoints/
+
+ # Editor temporaries
+ *.swn
+ *.swo
+ *.swp
+ *~
+
+ # editor settings
+ .idea
+ .vscode
+ _darcs
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
+ CMD nvidia-smi
+
+ ENV DEBIAN_FRONTEND noninteractive
+ RUN apt-get update && apt-get install -y \
+     git \
+     make build-essential libssl-dev zlib1g-dev \
+     libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
+     libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
+     ffmpeg libsm6 libxext6 cmake libgl1-mesa-glx \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -ms /bin/bash user
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH
+ RUN pyenv install 3.8.15 && \
+     pyenv global 3.8.15 && \
+     pyenv rehash && \
+     pip install --no-cache-dir --upgrade pip setuptools wheel
+
+ ENV WORKDIR=/code
+ WORKDIR $WORKDIR
+ RUN chown -R user:user $WORKDIR
+ RUN chmod -R 777 $WORKDIR
+
+ COPY requirements.txt $WORKDIR/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r $WORKDIR/requirements.txt
+ RUN pip install ninja
+
+ COPY . .
+
+ ARG TORCH_CUDA_ARCH_LIST=7.5+PTX
+
+ USER root
+ RUN chown -R user:user $HOME
+ RUN chmod -R 777 $HOME
+ RUN chown -R user:user $WORKDIR
+ RUN chmod -R 777 $WORKDIR
+
+ USER user
+ RUN ln -s $WORKDIR/annotator/OneFormer/oneformer/modeling/pixel_decoder/ops $WORKDIR/ && ls
+ RUN cd ops/ && FORCE_CUDA=1 python setup.py build --build-base=$WORKDIR/ install --user && cd ..
+ RUN sh deform_setup.sh
+
+ USER user
+ RUN sh deform_setup.sh
+
+ USER user
+
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ RUN --mount=type=secret,id=ACCESS_TOKEN,mode=0444,required=true
+
+
+ EXPOSE 7860
+
+ ENTRYPOINT ["python", "app.py"]
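The image compiles the deformable-attention CUDA ops under `pixel_decoder/ops` with `FORCE_CUDA=1` before the Gradio app starts. A minimal sanity-check sketch one might run inside the built container; the `MultiScaleDeformableAttention` module name is an assumption based on the upstream ops package, not something stated in this commit:

```python
# Hypothetical post-build smoke test for the container environment.
import torch

print("CUDA available:", torch.cuda.is_available())

try:
    # Assumed name of the extension built by pixel_decoder/ops/setup.py.
    import MultiScaleDeformableAttention  # noqa: F401
    print("Deformable attention ops: OK")
except ImportError as err:
    print("Deformable attention ops missing:", err)
```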
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: PAIR Diffusion
+ emoji: 📚
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
annotator/OneFormer/__init__.py ADDED
@@ -0,0 +1,61 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/SHI-Labs/OneFormer
+ # Modified by Vidit Goel (https://github.com/vidit98)
+ # ------------------------------------------------------------------------------
+
+ import os
+ import random
+ # fmt: off
+ import sys
+ sys.path.insert(1, './annotator/OneFormer')
+ # fmt: on
+
+ import imutils
+ import cv2
+ import numpy as np
+
+ from detectron2.config import get_cfg
+ from detectron2.projects.deeplab import add_deeplab_config
+ from detectron2.data import MetadataCatalog
+
+ from oneformer import (
+     add_oneformer_config,
+     add_common_config,
+     add_swin_config,
+     add_dinat_config,
+     add_convnext_config,
+ )
+ from demo.defaults import DefaultPredictor
+
+
+ def setup_cfg(config_file, wts):
+     # load config from file and command-line arguments
+     cfg = get_cfg()
+     add_deeplab_config(cfg)
+     add_common_config(cfg)
+     add_swin_config(cfg)
+     add_dinat_config(cfg)
+     add_convnext_config(cfg)
+     add_oneformer_config(cfg)
+     cfg.merge_from_file(config_file)
+     cfg.MODEL.WEIGHTS = wts
+     cfg.freeze()
+     return cfg
+
+
+ class OneformerSegmenter:
+     def __init__(self, wts, config='./annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml', confidence_thresh=0.5):
+         cfg = setup_cfg(config, wts)
+         metadata = MetadataCatalog.get(cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused")
+         self.predictor = DefaultPredictor(cfg)
+         self.metadata = metadata
+
+     def __call__(self, img, task):
+         if task == 'panoptic':
+             predictions = self.predictor(img, "panoptic")
+             panoptic_seg, segments_info = predictions["panoptic_seg"]
+             return panoptic_seg, segments_info
+         elif task == 'semantic':
+             predictions = self.predictor(img, "semantic")
+             semask = predictions["sem_seg"].argmax(dim=0)
+             return semask
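For reference, a minimal usage sketch of the `OneformerSegmenter` wrapper above; the checkpoint path is an assumption (any OneFormer COCO checkpoint matching the default Swin-L config would do):

```python
# Hypothetical usage of the OneformerSegmenter wrapper defined above.
import cv2
from annotator.OneFormer import OneformerSegmenter

segmenter = OneformerSegmenter(wts="./checkpoints/150_16_swin_l_oneformer_coco_100ep.pth")

img = cv2.imread("example.jpg")  # HxWx3 image as read by OpenCV

# Panoptic task: per-pixel segment ids plus per-segment metadata.
panoptic_seg, segments_info = segmenter(img, task="panoptic")

# Semantic task: per-pixel class ids.
semantic_mask = segmenter(img, task="semantic")
```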
annotator/OneFormer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     # NORM: "SyncBN"
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("ade20k_panoptic_train",)
+   TEST_PANOPTIC: ("ade20k_panoptic_val",)
+   TEST_INSTANCE: ("ade20k_instance_val",)
+   TEST_SEMANTIC: ("ade20k_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 160000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 512
+   MAX_SIZE_TRAIN: 2048
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (512, 512)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 512  # used in dataset mapper
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [256, 384, 512, 640, 768, 896]
+     MAX_SIZE: 3584
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
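The `MIN_SIZE_TRAIN` entry above relies on detectron2's `!!python/object/apply:eval` YAML tag, so the list of training scales is computed when the config is loaded. A quick worked expansion of that expression for the 512-pixel base size used in this file:

```python
# What the MIN_SIZE_TRAIN expression in the config above evaluates to.
base = 512
min_sizes = [int(x * 0.1 * base) for x in range(5, 21)]
print(min_sizes)
# 16 scales spanning 0.5x to 2.0x of the base size:
# [256, 307, 358, ..., 972, 1024]
```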
annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml ADDED
@@ -0,0 +1,38 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml ADDED
@@ -0,0 +1,38 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [256, 512, 1024, 2048]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]]
+   WEIGHTS: "150_16_dinat_l_oneformer_coco_100ep.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 896
+   MAX_SIZE_TRAIN: 3584
+   MAX_SIZE_TEST: 3584
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (896, 896)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 896  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [448, 678, 896, 1120, 1344, 1568]
+     MAX_SIZE: 6272
+     FLIP: True
annotator/OneFormer/configs/ade20k/oneformer_R50_bs16_160k.yaml ADDED
@@ -0,0 +1,58 @@
+ _BASE_: Base-ADE20K-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 150
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.5
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 896
+   MAX_SIZE_TRAIN: 3584
+   MAX_SIZE_TEST: 3584
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (896, 896)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 896  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [448, 678, 896, 1120, 1344, 1568]
+     MAX_SIZE: 6272
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml ADDED
@@ -0,0 +1,15 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 96
+     DEPTHS: [2, 2, 6, 2]
+     NUM_HEADS: [3, 6, 12, 24]
+     WINDOW_SIZE: 7
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+   WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
annotator/OneFormer/configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     NORM: "SyncBN"  # use syncbn for cityscapes dataset
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("cityscapes_fine_panoptic_train",)
+   TEST_PANOPTIC: ("cityscapes_fine_panoptic_val",)
+   TEST_INSTANCE: ("cityscapes_fine_instance_seg_val",)
+   TEST_SEMANTIC: ("cityscapes_fine_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 90000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1024
+   MAX_SIZE_TRAIN: 4096
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (512, 1024)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: -1
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
+     MAX_SIZE: 4096
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
annotator/OneFormer/configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "250_16_convnext_l_oneformer_mapillary_300k.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [256, 512, 1024, 2048]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 7
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 18, 1], [1, 5, 1, 9], [1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/oneformer_R50_bs16_90k.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-Cityscapes-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 19
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     ENC_LAYERS: 0
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/coco/Base-COCO-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,54 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     # NORM: "SyncBN"
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("coco_2017_train_panoptic_with_sem_seg",)
+   TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",)  # to evaluate instance and semantic performance as well
+   TEST_INSTANCE: ("coco_2017_val",)
+   TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   STEPS: (327778, 355092)
+   MAX_ITER: 368750
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 10
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   IMAGE_SIZE: 1024
+   MIN_SCALE: 0.1
+   MAX_SCALE: 2.0
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "coco_unified_lsj"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
annotator/OneFormer/configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml ADDED
@@ -0,0 +1,24 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   STEPS: (655556, 710184)
+   MAX_ITER: 737500
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/oneformer_R50_bs16_50ep.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-COCO-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 133
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     DETECTION_ON: False
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml ADDED
@@ -0,0 +1,28 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+
+   IS_DEMO: True
+   IS_TRAIN: False
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "./checkpoints/150_16_swin_l_oneformer_coco_100ep.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   STEPS: (655556, 735184)
+   MAX_ITER: 737500
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml ADDED
@@ -0,0 +1,15 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 96
+     DEPTHS: [2, 2, 6, 2]
+     NUM_HEADS: [3, 6, 12, 24]
+     WINDOW_SIZE: 7
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+   WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
annotator/OneFormer/configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     NORM: "SyncBN"  # use syncbn for cityscapes dataset
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("mapillary_vistas_panoptic_train",)
+   TEST_PANOPTIC: ("mapillary_vistas_panoptic_val",)
+   TEST_INSTANCE: ("mapillary_vistas_panoptic_val",)
+   TEST_SEMANTIC: ("mapillary_vistas_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 300000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 2048
+   MAX_SIZE_TRAIN: 8192
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1024, 1024)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1024  # used in dataset mapper
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.50
+     INSTANCE: 0.50
+ TEST:
+   EVAL_PERIOD: 30000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
+     MAX_SIZE: 4096
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 10
+ VERSION: 2
annotator/OneFormer/configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-Mapillary-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 65
+     NUM_CLASSES: 65
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     ENC_LAYERS: 0
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/datasets/README.md ADDED
@@ -0,0 +1,168 @@
+ # Prepare Datasets for OneFormer
+
+ - A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.).
+ - This document explains how to set up the builtin datasets so they can be used by the above APIs. [Training OneFormer with Custom Datasets](https://github.com/SHI-Labs/OneFormer/tree/main/datasets/custom_datasets) gives a deeper dive on how to train OneFormer with custom datasets.
+ - Detectron2 has builtin support for a few datasets. The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. Under this directory, detectron2 will look for datasets in the structure described below, if needed.
+
+ ```text
+ $DETECTRON2_DATASETS/
+   ADEChallengeData2016/
+   cityscapes/
+   coco/
+   mapillary_vistas/
+ ```
+
+ - You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. If left unset, the default is `./datasets` relative to your current working directory.
+
+
+ ## Expected dataset structure for [ADE20K](http://sceneparsing.csail.mit.edu/)
+
+ ```text
+ ADEChallengeData2016/
+   images/
+   annotations/
+   objectInfo150.txt
+   # download instance annotation
+   annotations_instance/
+   # generated by prepare_ade20k_sem_seg.py
+   annotations_detectron2/
+   # below are generated by prepare_ade20k_pan_seg.py
+   ade20k_panoptic_{train,val}.json
+   ade20k_panoptic_{train,val}/
+   # below are generated by prepare_ade20k_ins_seg.py
+   ade20k_instance_{train,val}.json
+ ```
+
+ - Generate `annotations_detectron2`:
+
+ ```bash
+ python datasets/prepare_ade20k_sem_seg.py
+ ```
+
+ - Install panopticapi by:
+
+ ```bash
+ pip install git+https://github.com/cocodataset/panopticapi.git
+ ```
+
+ - Download the instance annotation from <http://sceneparsing.csail.mit.edu/>:
+
+ ```bash
+ wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
+ ```
+
+ - Then, run `python datasets/prepare_ade20k_pan_seg.py` to combine semantic and instance annotations into panoptic annotations.
+
+ - Run `python datasets/prepare_ade20k_ins_seg.py` to extract instance annotations in COCO format.
+
+ ## Expected dataset structure for [Cityscapes](https://www.cityscapes-dataset.com/downloads/)
+
+ ```text
+ cityscapes/
+   gtFine/
+     train/
+       aachen/
+         color.png, instanceIds.png, labelIds.png, polygons.json,
+         labelTrainIds.png
+       ...
+     val/
+     test/
+     # below are generated Cityscapes panoptic annotation
+     cityscapes_panoptic_train.json
+     cityscapes_panoptic_train/
+     cityscapes_panoptic_val.json
+     cityscapes_panoptic_val/
+     cityscapes_panoptic_test.json
+     cityscapes_panoptic_test/
+   leftImg8bit/
+     train/
+     val/
+     test/
+ ```
+
+ - Log in and download the dataset:
+
+ ```bash
+ wget --keep-session-cookies --save-cookies=cookies.txt --post-data 'username=myusername&password=mypassword&submit=Login' https://www.cityscapes-dataset.com/login/
+ ######## gtFine
+ wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=1
+ ######## leftImg8bit
+ wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=3
+ ```
+
+ - Install cityscapes scripts by:
+
+ ```bash
+ pip install git+https://github.com/mcordts/cityscapesScripts.git
+ ```
+
+ - To create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation tool:
+
+ ```bash
+ git clone https://github.com/mcordts/cityscapesScripts.git
+ ```
+
+ ```bash
+ CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py
+ ```
+
+ These files are not needed for instance segmentation.
+
+ - To generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation tool:
+
+ ```bash
+ CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py
+ ```
+
+ These files are not needed for semantic and instance segmentation.
+
+ ## Expected dataset structure for [COCO](https://cocodataset.org/#download)
+
+ ```text
+ coco/
+   annotations/
+     instances_{train,val}2017.json
+     panoptic_{train,val}2017.json
+     caption_{train,val}2017.json
+     # evaluate on instance labels derived from panoptic annotations
+     panoptic2instances_val2017.json
+   {train,val}2017/
+     # image files that are mentioned in the corresponding json
+   panoptic_{train,val}2017/  # png annotations
+   panoptic_semseg_{train,val}2017/  # generated by the script mentioned below
+ ```
+
+ - Install panopticapi by:
+
+ ```bash
+ pip install git+https://github.com/cocodataset/panopticapi.git
+ ```
+
+ - Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation).
+
+ - Then run the following command to convert the panoptic json into instance json format (used for evaluation on the instance segmentation task):
+
+ ```bash
+ python datasets/panoptic2detection_coco_format.py --things_only
+ ```
+
+ ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas)
+
+ ```text
+ mapillary_vistas/
+   training/
+     images/
+     instances/
+     labels/
+     panoptic/
+   validation/
+     images/
+     instances/
+     labels/
+     panoptic/
+   mapillary_vistas_instance_{train,val}.json  # generated by the script mentioned below
+ ```
+
+ No preprocessing is needed for Mapillary Vistas on semantic and panoptic segmentation.
+
+ We do not evaluate for the instance segmentation task on the Mapillary Vistas dataset.
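As a quick check that the directory layout above is picked up, one can query detectron2's catalogs after pointing `DETECTRON2_DATASETS` at the data root. This is a hedged sketch: it assumes that importing the `oneformer` package (as the demo's `annotator/OneFormer/__init__.py` does) is what registers builtin names such as `ade20k_panoptic_train`.

```python
# Sketch: verify a builtin dataset is visible to detectron2 after setup.
import os

os.environ["DETECTRON2_DATASETS"] = "/path/to/datasets"  # the data root described above

from detectron2.data import DatasetCatalog, MetadataCatalog
import oneformer  # noqa: F401  # assumed to register the builtin datasets on import

meta = MetadataCatalog.get("ade20k_panoptic_train")      # class names, colors, ...
records = DatasetCatalog.get("ade20k_panoptic_train")    # list of per-image dicts
print(len(records), "training images")
```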
annotator/OneFormer/datasets/ade20k_instance_catid_mapping.txt ADDED
@@ -0,0 +1,104 @@
+ Instacne100 SceneParse150 FullADE20K
+ 1 8 165
+ 2 9 3055
+ 3 11 350
+ 4 13 1831
+ 5 15 774
+ 5 15 783
+ 6 16 2684
+ 7 19 687
+ 8 20 471
+ 9 21 401
+ 10 23 1735
+ 11 24 2473
+ 12 25 2329
+ 13 28 1564
+ 14 31 57
+ 15 32 2272
+ 16 33 907
+ 17 34 724
+ 18 36 2985
+ 18 36 533
+ 19 37 1395
+ 20 38 155
+ 21 39 2053
+ 22 40 689
+ 23 42 266
+ 24 43 581
+ 25 44 2380
+ 26 45 491
+ 27 46 627
+ 28 48 2388
+ 29 50 943
+ 30 51 2096
+ 31 54 2530
+ 32 56 420
+ 33 57 1948
+ 34 58 1869
+ 35 59 2251
+ 36 63 239
+ 37 65 571
+ 38 66 2793
+ 39 67 978
+ 40 68 236
+ 41 70 181
+ 42 71 629
+ 43 72 2598
+ 44 73 1744
+ 45 74 1374
+ 46 75 591
+ 47 76 2679
+ 48 77 223
+ 49 79 47
+ 50 81 327
+ 51 82 2821
+ 52 83 1451
+ 53 84 2880
+ 54 86 480
+ 55 87 77
+ 56 88 2616
+ 57 89 246
+ 57 89 247
+ 58 90 2733
+ 59 91 14
+ 60 93 38
+ 61 94 1936
+ 62 96 120
+ 63 98 1702
+ 64 99 249
+ 65 103 2928
+ 66 104 2337
+ 67 105 1023
+ 68 108 2989
+ 69 109 1930
+ 70 111 2586
+ 71 112 131
+ 72 113 146
+ 73 116 95
+ 74 117 1563
+ 75 119 1708
+ 76 120 103
+ 77 121 1002
+ 78 122 2569
+ 79 124 2833
+ 80 125 1551
+ 81 126 1981
+ 82 127 29
+ 83 128 187
+ 84 130 747
+ 85 131 2254
+ 86 133 2262
+ 87 134 1260
+ 88 135 2243
+ 89 136 2932
+ 90 137 2836
+ 91 138 2850
+ 92 139 64
+ 93 140 894
+ 94 143 1919
+ 95 144 1583
+ 96 145 318
+ 97 147 2046
+ 98 148 1098
+ 99 149 530
+ 100 150 954
annotator/OneFormer/datasets/custom_datasets/README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training OneFormer with Custom Datasets
2
+
3
+ OneFormer advocates the usage of panoptic annotations along with its task-conditioned joint training strategy. However, if panoptic annotations are not available, then also OneFormer can be trained using only the instance or semantic annotations on custom datasets. We provide some guidelines for training with custom datasets.
4
+
5
+ ## Register your New Dataset
6
+
7
+ - OneFormer uses the information (class names, thing classes, etc.) stored in a dataset's metadata while preparing a dataset dictionary using a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers).
8
+
9
+ - [Use Custom Datasets](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) gives a deeper dive into registering a new custom dataset; a minimal registration sketch is shown below.
10
+
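+ As a minimal sketch (the dataset name, class names, id mapping, and loader function below are placeholders, not part of OneFormer), registering a custom instance-segmentation dataset with detectron2 typically looks like this:
+
+ ```python
+ # Hypothetical registration sketch for a custom instance-segmentation dataset.
+ from detectron2.data import DatasetCatalog, MetadataCatalog
+
+ def load_my_dataset_train():
+     # Return a list[dict] in the standard Detectron2 Dataset format
+     # (each dict holds "file_name", "height", "width", "annotations", ...).
+     return []
+
+ DatasetCatalog.register("my_dataset_train", load_my_dataset_train)
+ MetadataCatalog.get("my_dataset_train").set(
+     thing_classes=["cat", "dog"],                    # placeholder class names
+     thing_dataset_id_to_contiguous_id={1: 0, 2: 1},  # read by the custom mappers
+     ignore_label=255,
+ )
+ ```
+
+ The `thing_classes` and `thing_dataset_id_to_contiguous_id` fields are exactly the metadata the instance mappers in this folder read; the semantic mapper instead reads `stuff_classes` and `ignore_label` from the same metadata.
+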
11
+ ## Training with Available Panoptic Annotations
12
+
13
+ - To prepare the dataset dictionary for each iteration during training, OneFormer uses a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers) class.
14
+
15
+ - Out of the box, we provide two `dataset_mapper` classes that support task-conditioned joint training using panoptic annotations:
16
+ - [`COCOUnifiedNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py#L56): Specifically designed for COCO annotation format.
17
+ - [`OneFormerUnifiedDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py#L26): General annotation format.
18
+
19
+ - If you have panoptic annotations for your custom dataset, you may use these dataset_mapper classes directly after registering your dataset. You may also tune the [task sampling probabilities in the corresponding config file](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml#L55).
20
+
21
+ - If you want to train using only instance or semantic annotations, please follow the next section on preparing a custom dataset mapper class.
22
+
23
+ ## Write a Custom Dataset Mapper Class
24
+
25
+ - If you want to train using only instance or semantic annotations, write your custom dataset mapper class and add it to the [`build_train_loader`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/train_net.py#L156) method (a sketch of such an override is shown after this list).
26
+
27
+ - We provide a few templates for custom dataset mappers:
28
+ - [`InstanceCOCOCustomNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py#L72): Specifically designed for COCO instance annotation format.
29
+ - [`InstanceOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py#L26): General instance annotation format.
30
+ - [`SemanticOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py#L26): General semantic annotation format.
31
+
32
+ - Remember to register your custom dataset before training.
33
+
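+ As a rough sketch (the trainer class and mapper choice are illustrative, not the repository's exact `train_net.py`, and the import path assumes you run from the repository root), an override using the COCO-format custom instance mapper from this folder could look like this:
+
+ ```python
+ # Illustrative Trainer.build_train_loader override with a custom mapper.
+ from detectron2.data import build_detection_train_loader
+ from detectron2.engine import DefaultTrainer
+
+ from datasets.custom_datasets.instance_coco_custom_dataset_mapper import (
+     InstanceCOCOCustomNewBaselineDatasetMapper,
+ )
+
+ class Trainer(DefaultTrainer):
+     @classmethod
+     def build_train_loader(cls, cfg):
+         # Swap in the custom COCO-format instance mapper in place of the
+         # panoptic-based unified mappers.
+         mapper = InstanceCOCOCustomNewBaselineDatasetMapper(cfg, is_train=True)
+         return build_detection_train_loader(cfg, mapper=mapper)
+ ```
+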
34
+
35
+ Now you are all set to train OneFormer using your custom dataset!
annotator/OneFormer/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py ADDED
@@ -0,0 +1,235 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ from detectron2.data import MetadataCatalog
13
+ from detectron2.config import configurable
14
+ from detectron2.data import detection_utils as utils
15
+ from detectron2.data import transforms as T
16
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
17
+ from pycocotools import mask as coco_mask
18
+
19
+ __all__ = ["InstanceCOCOCustomNewBaselineDatasetMapper"]
20
+
21
+
22
+ def convert_coco_poly_to_mask(segmentations, height, width):
23
+ masks = []
24
+ for polygons in segmentations:
25
+ rles = coco_mask.frPyObjects(polygons, height, width)
26
+ mask = coco_mask.decode(rles)
27
+ if len(mask.shape) < 3:
28
+ mask = mask[..., None]
29
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
30
+ mask = mask.any(dim=2)
31
+ masks.append(mask)
32
+ if masks:
33
+ masks = torch.stack(masks, dim=0)
34
+ else:
35
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
36
+ return masks
37
+
38
+
39
+ def build_transform_gen(cfg, is_train):
40
+ """
41
+ Create a list of default :class:`Augmentation` from config.
42
+ Now it includes resizing and flipping.
43
+ Returns:
44
+ list[Augmentation]
45
+ """
46
+ assert is_train, "Only support training augmentation"
47
+ image_size = cfg.INPUT.IMAGE_SIZE
48
+ min_scale = cfg.INPUT.MIN_SCALE
49
+ max_scale = cfg.INPUT.MAX_SCALE
50
+
51
+ augmentation = []
52
+
53
+ if cfg.INPUT.RANDOM_FLIP != "none":
54
+ augmentation.append(
55
+ T.RandomFlip(
56
+ horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
57
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
58
+ )
59
+ )
60
+
61
+ augmentation.extend([
62
+ T.ResizeScale(
63
+ min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
64
+ ),
65
+ T.FixedSizeCrop(crop_size=(image_size, image_size)),
66
+ ])
67
+
68
+ return augmentation
69
+
70
+
71
+ # This is specifically designed for the COCO Instance Segmentation dataset.
72
+ class InstanceCOCOCustomNewBaselineDatasetMapper:
73
+ """
74
+ A callable which takes a dataset dict in Detectron2 Dataset format,
75
+ and maps it into a format used by OneFormer for custom instance segmentation using the COCO format.
76
+
77
+ The callable currently does the following:
78
+
79
+ 1. Reads the image from "file_name"
80
+ 2. Applies geometric transforms to the image and annotation
81
+ 3. Finds and applies suitable cropping to the image and annotation
82
+ 4. Prepares the image and annotation as Tensors
83
+ """
84
+
85
+ @configurable
86
+ def __init__(
87
+ self,
88
+ is_train=True,
89
+ *,
90
+ num_queries,
91
+ tfm_gens,
92
+ meta,
93
+ image_format,
94
+ max_seq_len,
95
+ task_seq_len,
96
+ ):
97
+ """
98
+ NOTE: this interface is experimental.
99
+ Args:
100
+ is_train: for training or inference
101
+ num_queries: number of text queries (NUM_OBJECT_QUERIES - N_CTX)
+ meta: dataset metadata
+ max_seq_len, task_seq_len: sequence lengths for the text and task tokenizers
103
+ tfm_gens: data augmentation
104
+ image_format: an image format supported by :func:`detection_utils.read_image`.
105
+ """
106
+ self.tfm_gens = tfm_gens
107
+ logging.getLogger(__name__).info(
108
+ "[InstanceCOCOCustomNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
109
+ str(self.tfm_gens)
110
+ )
111
+ )
112
+
113
+ self.img_format = image_format
114
+ self.is_train = is_train
115
+ self.meta = meta
116
+ self.num_queries = num_queries
117
+
118
+ self.things = []
119
+ for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
120
+ self.things.append(v)
121
+ self.class_names = self.meta.thing_classes
122
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
123
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
124
+
125
+ @classmethod
126
+ def from_config(cls, cfg, is_train=True):
127
+ # Build augmentation
128
+ tfm_gens = build_transform_gen(cfg, is_train)
129
+ dataset_names = cfg.DATASETS.TRAIN
130
+ meta = MetadataCatalog.get(dataset_names[0])
131
+
132
+ ret = {
133
+ "is_train": is_train,
134
+ "meta": meta,
135
+ "tfm_gens": tfm_gens,
136
+ "image_format": cfg.INPUT.FORMAT,
137
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
138
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
139
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
140
+ }
141
+ return ret
142
+
143
+ def _get_texts(self, classes, num_class_obj):
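+ # Builds the list of text queries for this image: one "a photo with a <class>"
+ # entry per annotated object (capped at self.num_queries), with the remaining
+ # slots left as the generic "an instance photo" prompt.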
144
+
145
+ classes = list(np.array(classes))
146
+ texts = ["an instance photo"] * self.num_queries
147
+
148
+ for class_id in classes:
149
+ cls_name = self.class_names[class_id]
150
+ num_class_obj[cls_name] += 1
151
+
152
+ num = 0
153
+ for i, cls_name in enumerate(self.class_names):
154
+ if num_class_obj[cls_name] > 0:
155
+ for _ in range(num_class_obj[cls_name]):
156
+ if num >= len(texts):
157
+ break
158
+ texts[num] = f"a photo with a {cls_name}"
159
+ num += 1
160
+
161
+ return texts
162
+
163
+ def __call__(self, dataset_dict):
164
+ """
165
+ Args:
166
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
167
+
168
+ Returns:
169
+ dict: a format that builtin models in detectron2 accept
170
+ """
171
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
172
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
173
+ utils.check_image_size(dataset_dict, image)
174
+
175
+ # TODO: get padding mask
176
+ # by feeding a "segmentation mask" to the same transforms
177
+ padding_mask = np.ones(image.shape[:2])
178
+
179
+ image, transforms = T.apply_transform_gens(self.tfm_gens, image)
180
+ # the crop transformation has default padding value 0 for segmentation
181
+ padding_mask = transforms.apply_segmentation(padding_mask)
182
+ padding_mask = ~ padding_mask.astype(bool)
183
+
184
+ image_shape = image.shape[:2] # h, w
185
+
186
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
187
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
188
+ # Therefore it's important to use torch.Tensor.
189
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
190
+ dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
191
+
192
+ if not self.is_train:
193
+ # USER: Modify this if you want to keep them for some reason.
194
+ dataset_dict.pop("annotations", None)
195
+ return dataset_dict
196
+
197
+ if "annotations" in dataset_dict:
198
+ # USER: Modify this if you want to keep them for some reason.
199
+ for anno in dataset_dict["annotations"]:
200
+ anno.pop("keypoints", None)
201
+
202
+ # USER: Implement additional transformations if you have other types of data
203
+ annos = [
204
+ utils.transform_instance_annotations(obj, transforms, image_shape)
205
+ for obj in dataset_dict.pop("annotations")
206
+ if obj.get("iscrowd", 0) == 0
207
+ ]
208
+
209
+ instances = utils.annotations_to_instances(annos, image_shape)
210
+
211
+ instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
212
+ # Need to filter empty instances first (due to augmentation)
213
+ instances = utils.filter_empty_instances(instances)
214
+ # Generate masks from polygon
215
+ h, w = instances.image_size
216
+ # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
217
+ if hasattr(instances, 'gt_masks'):
218
+ gt_masks = instances.gt_masks
219
+ gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
220
+ instances.gt_masks = gt_masks
221
+
222
+ num_class_obj = {}
223
+ for name in self.class_names:
224
+ num_class_obj[name] = 0
225
+
226
+ task = "The task is instance"
227
+ text = self._get_texts(instances.gt_classes, num_class_obj)
228
+
229
+ dataset_dict["instances"] = instances
230
+ dataset_dict["orig_shape"] = image_shape
231
+ dataset_dict["task"] = task
232
+ dataset_dict["text"] = text
233
+ dataset_dict["thing_ids"] = self.things
234
+
235
+ return dataset_dict
annotator/OneFormer/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py ADDED
@@ -0,0 +1,245 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+ import os
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.nn import functional as F
13
+
14
+ from detectron2.config import configurable
15
+ from detectron2.data import detection_utils as utils
16
+ from detectron2.data import transforms as T
17
+ from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
18
+ from detectron2.data import MetadataCatalog
19
+ from detectron2.projects.point_rend import ColorAugSSDTransform
20
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
21
+ import pycocotools.mask as mask_util
22
+
23
+ __all__ = ["InstanceOneFormerCustomDatasetMapper"]
24
+
25
+
26
+ class InstanceOneFormerCustomDatasetMapper:
27
+ """
28
+ A callable which takes a dataset dict in Detectron2 Dataset format,
29
+ and maps it into a format used by OneFormer for custom instance segmentation.
30
+
31
+ The callable currently does the following:
32
+
33
+ 1. Reads the image from "file_name"
34
+ 2. Applies geometric transforms to the image and annotation
35
+ 3. Finds and applies suitable cropping to the image and annotation
36
+ 4. Prepares the image and annotation as Tensors
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ is_train=True,
43
+ *,
44
+ name,
45
+ num_queries,
46
+ meta,
47
+ augmentations,
48
+ image_format,
49
+ size_divisibility,
50
+ task_seq_len,
51
+ max_seq_len,
52
+ ):
53
+ """
54
+ NOTE: this interface is experimental.
55
+ Args:
56
+ is_train: for training or inference
57
+ augmentations: a list of augmentations or deterministic transforms to apply
58
+ image_format: an image format supported by :func:`detection_utils.read_image`.
59
+ ignore_label: the label that is ignored during evaluation
60
+ size_divisibility: pad image size to be divisible by this value
61
+ """
62
+ self.is_train = is_train
63
+ self.meta = meta
64
+ self.name = name
65
+ self.tfm_gens = augmentations
66
+ self.img_format = image_format
67
+ self.size_divisibility = size_divisibility
68
+ self.num_queries = num_queries
69
+
70
+ logger = logging.getLogger(__name__)
71
+ mode = "training" if is_train else "inference"
72
+ logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
73
+
74
+ self.things = []
75
+ for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
76
+ self.things.append(v)
77
+ self.class_names = self.meta.thing_classes
78
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
79
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
80
+
81
+ @classmethod
82
+ def from_config(cls, cfg, is_train=True):
83
+ # Build augmentation
84
+ augs = [
85
+ T.ResizeShortestEdge(
86
+ cfg.INPUT.MIN_SIZE_TRAIN,
87
+ cfg.INPUT.MAX_SIZE_TRAIN,
88
+ cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
89
+ )
90
+ ]
91
+ if cfg.INPUT.CROP.ENABLED:
92
+ augs.append(
93
+ T.RandomCrop(
94
+ cfg.INPUT.CROP.TYPE,
95
+ cfg.INPUT.CROP.SIZE,
96
+ )
97
+ )
98
+ if cfg.INPUT.COLOR_AUG_SSD:
99
+ augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
100
+ augs.append(T.RandomFlip())
101
+
102
+ # Assume always applies to the training set.
103
+ dataset_names = cfg.DATASETS.TRAIN
104
+ meta = MetadataCatalog.get(dataset_names[0])
105
+
106
+ ret = {
107
+ "is_train": is_train,
108
+ "meta": meta,
109
+ "name": dataset_names[0],
110
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
111
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
112
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
113
+ "augmentations": augs,
114
+ "image_format": cfg.INPUT.FORMAT,
115
+ "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
116
+ # NOTE: task sampling probabilities are not needed by this instance-only mapper
+ # and are not accepted by __init__, so they are intentionally not returned here.
118
+ }
119
+ return ret
120
+
121
+ def _get_texts(self, classes, num_class_obj):
122
+
123
+ classes = list(np.array(classes))
124
+ texts = ["an instance photo"] * self.num_queries
125
+
126
+ for class_id in classes:
127
+ cls_name = self.class_names[class_id]
128
+ num_class_obj[cls_name] += 1
129
+
130
+ num = 0
131
+ for i, cls_name in enumerate(self.class_names):
132
+ if num_class_obj[cls_name] > 0:
133
+ for _ in range(num_class_obj[cls_name]):
134
+ if num >= len(texts):
135
+ break
136
+ texts[num] = f"a photo with a {cls_name}"
137
+ num += 1
138
+
139
+ return texts
140
+
141
+ def __call__(self, dataset_dict):
142
+ """
143
+ Args:
144
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
145
+
146
+ Returns:
147
+ dict: a format that builtin models in detectron2 accept
148
+ """
149
+ assert self.is_train, "OneFormerDatasetMapper should only be used for training!"
150
+
151
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
152
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
153
+ utils.check_image_size(dataset_dict, image)
154
+
155
+ aug_input = T.AugInput(image)
156
+ aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
157
+ image = aug_input.image
158
+
159
+ # transform instance masks
160
+ assert "annotations" in dataset_dict
161
+ for anno in dataset_dict["annotations"]:
162
+ anno.pop("keypoints", None)
163
+
164
+ annos = [
165
+ utils.transform_instance_annotations(obj, transforms, image.shape[:2])
166
+ for obj in dataset_dict.pop("annotations")
167
+ if obj.get("iscrowd", 0) == 0
168
+ ]
169
+
170
+ if len(annos):
171
+ assert "segmentation" in annos[0]
172
+ segms = [obj["segmentation"] for obj in annos]
173
+ masks = []
174
+ for segm in segms:
175
+ if isinstance(segm, list):
176
+ # polygon
177
+ masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
178
+ elif isinstance(segm, dict):
179
+ # COCO RLE
180
+ masks.append(mask_util.decode(segm))
181
+ elif isinstance(segm, np.ndarray):
182
+ assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
183
+ segm.ndim
184
+ )
185
+ # mask array
186
+ masks.append(segm)
187
+ else:
188
+ raise ValueError(
189
+ "Cannot convert segmentation of type '{}' to BitMasks!"
190
+ "Supported types are: polygons as list[list[float] or ndarray],"
191
+ " COCO-style RLE as a dict, or a binary segmentation mask "
192
+ " in a 2D numpy array of shape HxW.".format(type(segm))
193
+ )
194
+
195
+ # Pad image and segmentation label here!
196
+ image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
197
+ masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
198
+
199
+ classes = [int(obj["category_id"]) for obj in annos]
200
+ classes = torch.tensor(classes, dtype=torch.int64)
201
+
202
+ if self.size_divisibility > 0:
203
+ image_size = (image.shape[-2], image.shape[-1])
204
+ padding_size = [
205
+ 0,
206
+ self.size_divisibility - image_size[1],
207
+ 0,
208
+ self.size_divisibility - image_size[0],
209
+ ]
210
+ # pad image
211
+ image = F.pad(image, padding_size, value=128).contiguous()
212
+ # pad mask
213
+ masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
214
+
215
+ image_shape = (image.shape[-2], image.shape[-1]) # h, w
216
+
217
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
218
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
219
+ # Therefore it's important to use torch.Tensor.
220
+ dataset_dict["image"] = image
221
+
222
+ # Prepare per-category binary masks
223
+ instances = Instances(image_shape)
224
+ instances.gt_classes = classes
225
+ if len(masks) == 0:
226
+ # Some image does not have annotation (all ignored)
227
+ instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
228
+ else:
229
+ masks = BitMasks(torch.stack(masks))
230
+ instances.gt_masks = masks.tensor
231
+
232
+ num_class_obj = {}
233
+ for name in self.class_names:
234
+ num_class_obj[name] = 0
235
+
236
+ task = "The task is instance"
237
+ text = self._get_texts(instances.gt_classes, num_class_obj)
238
+
239
+ dataset_dict["instances"] = instances
240
+ dataset_dict["orig_shape"] = image_shape
241
+ dataset_dict["task"] = task
242
+ dataset_dict["text"] = text
243
+ dataset_dict["thing_ids"] = self.things
244
+
245
+ return dataset_dict
annotator/OneFormer/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py ADDED
@@ -0,0 +1,238 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+ import os
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.nn import functional as F
13
+
14
+ from detectron2.config import configurable
15
+ from detectron2.data import detection_utils as utils
16
+ from detectron2.data import transforms as T
17
+ from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
18
+ from detectron2.data import MetadataCatalog
19
+ from detectron2.projects.point_rend import ColorAugSSDTransform
20
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
21
+ import pycocotools.mask as mask_util
22
+
23
+ __all__ = ["SemanticOneFormerCustomDatasetMapper"]
24
+
25
+
26
+ class SemanticOneFormerCustomDatasetMapper:
27
+ """
28
+ A callable which takes a dataset dict in Detectron2 Dataset format,
29
+ and maps it into a format used by OneFormer for custom semantic segmentation.
30
+
31
+ The callable currently does the following:
32
+
33
+ 1. Reads the image from "file_name"
34
+ 2. Applies geometric transforms to the image and annotation
35
+ 3. Finds and applies suitable cropping to the image and annotation
36
+ 4. Prepares the image and annotation as Tensors
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ is_train=True,
43
+ *,
44
+ name,
45
+ num_queries,
46
+ meta,
47
+ augmentations,
48
+ image_format,
49
+ ignore_label,
50
+ size_divisibility,
51
+ task_seq_len,
52
+ max_seq_len,
53
+ ):
54
+ """
55
+ NOTE: this interface is experimental.
56
+ Args:
57
+ is_train: for training or inference
58
+ augmentations: a list of augmentations or deterministic transforms to apply
59
+ image_format: an image format supported by :func:`detection_utils.read_image`.
60
+ ignore_label: the label that is ignored during evaluation
61
+ size_divisibility: pad image size to be divisible by this value
62
+ """
63
+ self.is_train = is_train
64
+ self.meta = meta
65
+ self.name = name
66
+ self.tfm_gens = augmentations
67
+ self.img_format = image_format
68
+ self.ignore_label = ignore_label
69
+ self.size_divisibility = size_divisibility
70
+ self.num_queries = num_queries
71
+
72
+ logger = logging.getLogger(__name__)
73
+ mode = "training" if is_train else "inference"
74
+ logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
75
+
76
+ self.class_names = self.meta.stuff_classes
77
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
78
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
79
+
80
+ @classmethod
81
+ def from_config(cls, cfg, is_train=True):
82
+ # Build augmentation
83
+ augs = [
84
+ T.ResizeShortestEdge(
85
+ cfg.INPUT.MIN_SIZE_TRAIN,
86
+ cfg.INPUT.MAX_SIZE_TRAIN,
87
+ cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
88
+ )
89
+ ]
90
+ if cfg.INPUT.CROP.ENABLED:
91
+ augs.append(
92
+ T.RandomCrop_CategoryAreaConstraint(
93
+ cfg.INPUT.CROP.TYPE,
94
+ cfg.INPUT.CROP.SIZE,
95
+ cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
96
+ cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
97
+ )
98
+ )
99
+ if cfg.INPUT.COLOR_AUG_SSD:
100
+ augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
101
+ augs.append(T.RandomFlip())
102
+
103
+ # Assume always applies to the training set.
104
+ dataset_names = cfg.DATASETS.TRAIN
105
+ meta = MetadataCatalog.get(dataset_names[0])
106
+ ignore_label = meta.ignore_label
107
+
108
+ ret = {
109
+ "is_train": is_train,
110
+ "meta": meta,
111
+ "name": dataset_names[0],
112
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
113
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
114
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
115
+ "augmentations": augs,
116
+ "image_format": cfg.INPUT.FORMAT,
117
+ "ignore_label": ignore_label,
118
+ "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
119
+ }
120
+ return ret
121
+
122
+ def _get_texts(self, classes, num_class_obj):
123
+
124
+ classes = list(np.array(classes))
125
+ texts = ["an semantic photo"] * self.num_queries
126
+
127
+ for class_id in classes:
128
+ cls_name = self.class_names[class_id]
129
+ num_class_obj[cls_name] += 1
130
+
131
+ num = 0
132
+ for i, cls_name in enumerate(self.class_names):
133
+ if num_class_obj[cls_name] > 0:
134
+ for _ in range(num_class_obj[cls_name]):
135
+ if num >= len(texts):
136
+ break
137
+ texts[num] = f"a photo with a {cls_name}"
138
+ num += 1
139
+
140
+ return texts
141
+
142
+ def __call__(self, dataset_dict):
143
+ """
144
+ Args:
145
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
146
+
147
+ Returns:
148
+ dict: a format that builtin models in detectron2 accept
149
+ """
150
+ assert self.is_train, "SemanticOneFormerCustomDatasetMapper should only be used for training!"
151
+
152
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
153
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
154
+ utils.check_image_size(dataset_dict, image)
155
+
156
+ if "sem_seg_file_name" in dataset_dict:
157
+ # PyTorch transformation not implemented for uint16, so converting it to double first
158
+ sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
159
+ else:
160
+ sem_seg_gt = None
161
+
162
+ if sem_seg_gt is None:
163
+ raise ValueError(
164
+ "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
165
+ dataset_dict["file_name"]
166
+ )
167
+ )
168
+
169
+ aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
170
+ aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
171
+ image = aug_input.image
172
+ sem_seg_gt = aug_input.sem_seg
173
+
174
+ # Pad image and segmentation label here!
175
+ image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
176
+ if sem_seg_gt is not None:
177
+ sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
178
+
179
+ if self.size_divisibility > 0:
180
+ image_size = (image.shape[-2], image.shape[-1])
181
+ padding_size = [
182
+ 0,
183
+ self.size_divisibility - image_size[1],
184
+ 0,
185
+ self.size_divisibility - image_size[0],
186
+ ]
187
+ image = F.pad(image, padding_size, value=128).contiguous()
188
+ if sem_seg_gt is not None:
189
+ sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
190
+
191
+ image_shape = (image.shape[-2], image.shape[-1]) # h, w
192
+
193
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
194
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
195
+ # Therefore it's important to use torch.Tensor.
196
+ dataset_dict["image"] = image
197
+
198
+ if sem_seg_gt is not None:
199
+ dataset_dict["sem_seg"] = sem_seg_gt.long()
200
+
201
+ if "annotations" in dataset_dict:
202
+ raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
203
+
204
+ # Prepare per-category binary masks
205
+ if sem_seg_gt is not None:
206
+ sem_seg_gt = sem_seg_gt.numpy()
207
+ instances = Instances(image_shape)
208
+ classes = np.unique(sem_seg_gt)
209
+ # remove ignored region
210
+ classes = classes[classes != self.ignore_label]
211
+ instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
212
+
213
+ masks = []
214
+ for class_id in classes:
215
+ masks.append(sem_seg_gt == class_id)
216
+
217
+ if len(masks) == 0:
218
+ # Some image does not have annotation (all ignored)
219
+ instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
220
+ else:
221
+ masks = BitMasks(
222
+ torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
223
+ )
224
+ instances.gt_masks = masks.tensor
225
+
226
+ num_class_obj = {}
227
+ for name in self.class_names:
228
+ num_class_obj[name] = 0
229
+
230
+ task = "The task is semantic"
231
+ text = self._get_texts(instances.gt_classes, num_class_obj)
232
+
233
+ dataset_dict["instances"] = instances
234
+ dataset_dict["orig_shape"] = image_shape
235
+ dataset_dict["task"] = task
236
+ dataset_dict["text"] = text
237
+
238
+ return dataset_dict
annotator/OneFormer/datasets/fg_ids.py ADDED
@@ -0,0 +1,108 @@
1
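+ # Maps the 100 ADE20K "Instance100" thing category ids to their SceneParse150
+ # semantic ids (the same pairs as in ade20k_instance_catid_mapping.txt).
+ # Note: the repeated keys 5, 18 and 57 each collapse to a single dict entry.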
+ ADE20K_FG_IDS = {
2
+ 1: 8,
3
+ 2: 9,
4
+ 3: 11,
5
+ 4: 13,
6
+ 5: 15,
7
+ 5: 15,
8
+ 6: 16,
9
+ 7: 19,
10
+ 8: 20,
11
+ 9: 21,
12
+ 10: 23,
13
+ 11: 24,
14
+ 12: 25,
15
+ 13: 28,
16
+ 14: 31,
17
+ 15: 32,
18
+ 16: 33,
19
+ 17: 34,
20
+ 18: 36,
21
+ 18: 36,
22
+ 19: 37,
23
+ 20: 38,
24
+ 21: 39,
25
+ 22: 40,
26
+ 23: 42,
27
+ 24: 43,
28
+ 25: 44,
29
+ 26: 45,
30
+ 27: 46,
31
+ 28: 48,
32
+ 29: 50,
33
+ 30: 51,
34
+ 31: 54,
35
+ 32: 56,
36
+ 33: 57,
37
+ 34: 58,
38
+ 35: 59,
39
+ 36: 63,
40
+ 37: 65,
41
+ 38: 66,
42
+ 39: 67,
43
+ 40: 68,
44
+ 41: 70,
45
+ 42: 71,
46
+ 43: 72,
47
+ 44: 73,
48
+ 45: 74,
49
+ 46: 75,
50
+ 47: 76,
51
+ 48: 77,
52
+ 49: 79,
53
+ 50: 81,
54
+ 51: 82,
55
+ 52: 83,
56
+ 53: 84,
57
+ 54: 86,
58
+ 55: 87,
59
+ 56: 88,
60
+ 57: 89,
61
+ 57: 89,
62
+ 58: 90,
63
+ 59: 91,
64
+ 60: 93,
65
+ 61: 94,
66
+ 62: 96,
67
+ 63: 98,
68
+ 64: 99,
69
+ 65: 103,
70
+ 66: 104,
71
+ 67: 105,
72
+ 68: 108,
73
+ 69: 109,
74
+ 70: 111,
75
+ 71: 112,
76
+ 72: 113,
77
+ 73: 116,
78
+ 74: 117,
79
+ 75: 119,
80
+ 76: 120,
81
+ 77: 121,
82
+ 78: 122,
83
+ 79: 124,
84
+ 80: 125,
85
+ 81: 126,
86
+ 82: 127,
87
+ 83: 128,
88
+ 84: 130,
89
+ 85: 131,
90
+ 86: 133,
91
+ 87: 134,
92
+ 88: 135,
93
+ 89: 136,
94
+ 90: 137,
95
+ 91: 138,
96
+ 92: 139,
97
+ 93: 140,
98
+ 94: 143,
99
+ 95: 144,
100
+ 96: 145,
101
+ 97: 147,
102
+ 98: 148,
103
+ 99: 149,
104
+ 100: 150
105
+ }
106
+
107
+
108
+ CITYSCAPES_FG_NAMES = ['person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle']
annotator/OneFormer/datasets/panoptic2detection_coco_format.py ADDED
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python
2
+ # ------------------------------------------------------------------------------
3
+ # Reference: https://github.com/cocodataset/panopticapi/blob/master/converters/panoptic2detection_coco_format.py
4
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
5
+ # ------------------------------------------------------------------------------
6
+ '''
7
+ This script converts panoptic COCO format to detection COCO format. More
8
+ information about the formats can be found here:
9
+ http://cocodataset.org/#format-data. All segments will be stored in RLE format.
10
+
11
+ Additional option:
12
+ - using option '--things_only' the script can discard all stuff
13
+ segments, saving segments of things classes only.
14
+ '''
15
+ from __future__ import absolute_import
16
+ from __future__ import division
17
+ from __future__ import print_function
18
+ from __future__ import unicode_literals
19
+ import os, sys
20
+ import argparse
21
+ import numpy as np
22
+ import json
23
+ import time
24
+ import multiprocessing
25
+
26
+ import PIL.Image as Image
27
+
28
+ from panopticapi.utils import get_traceback, rgb2id, save_json
29
+
30
+ try:
31
+ # set up path for pycocotools
32
+ # sys.path.append('./cocoapi-master/PythonAPI/')
33
+ from pycocotools import mask as COCOmask
34
+ except Exception:
35
+ raise Exception("Please install pycocotools module from https://github.com/cocodataset/cocoapi")
36
+
37
+ @get_traceback
38
+ def convert_panoptic_to_detection_coco_format_single_core(
39
+ proc_id, annotations_set, categories, segmentations_folder, things_only
40
+ ):
41
+ annotations_detection = []
42
+ for working_idx, annotation in enumerate(annotations_set):
43
+ if working_idx % 100 == 0:
44
+ print('Core: {}, {} from {} images processed'.format(proc_id,
45
+ working_idx,
46
+ len(annotations_set)))
47
+
48
+ file_name = '{}.png'.format(annotation['file_name'].rsplit('.')[0])
49
+ try:
50
+ pan_format = np.array(
51
+ Image.open(os.path.join(segmentations_folder, file_name)), dtype=np.uint32
52
+ )
53
+ except IOError:
54
+ raise KeyError('no prediction png file for id: {}'.format(annotation['image_id']))
55
+ pan = rgb2id(pan_format)
56
+
57
+ for segm_info in annotation['segments_info']:
58
+ if things_only and categories[segm_info['category_id']]['isthing'] != 1:
59
+ continue
60
+ mask = (pan == segm_info['id']).astype(np.uint8)
61
+ mask = np.expand_dims(mask, axis=2)
62
+ segm_info.pop('id')
63
+ segm_info['image_id'] = annotation['image_id']
64
+ rle = COCOmask.encode(np.asfortranarray(mask))[0]
65
+ rle['counts'] = rle['counts'].decode('utf8')
66
+ segm_info['segmentation'] = rle
67
+ annotations_detection.append(segm_info)
68
+
69
+ print('Core: {}, all {} images processed'.format(proc_id, len(annotations_set)))
70
+ return annotations_detection
71
+
72
+
73
+ def convert_panoptic_to_detection_coco_format(input_json_file,
74
+ segmentations_folder,
75
+ output_json_file,
76
+ categories_json_file,
77
+ things_only):
78
+ start_time = time.time()
79
+
80
+ if segmentations_folder is None:
81
+ segmentations_folder = input_json_file.rsplit('.', 1)[0]
82
+
83
+ print("CONVERTING...")
84
+ print("COCO panoptic format:")
85
+ print("\tSegmentation folder: {}".format(segmentations_folder))
86
+ print("\tJSON file: {}".format(input_json_file))
87
+ print("TO")
88
+ print("COCO detection format")
89
+ print("\tJSON file: {}".format(output_json_file))
90
+ if things_only:
91
+ print("Saving only segments of things classes.")
92
+ print('\n')
93
+
94
+ print("Reading annotation information from {}".format(input_json_file))
95
+ with open(input_json_file, 'r') as f:
96
+ d_coco = json.load(f)
97
+ annotations_panoptic = d_coco['annotations']
98
+
99
+ with open(categories_json_file, 'r') as f:
100
+ categories_list = json.load(f)
101
+ categories = {category['id']: category for category in categories_list}
102
+
103
+ cpu_num = multiprocessing.cpu_count()
104
+ annotations_split = np.array_split(annotations_panoptic, cpu_num)
105
+ print("Number of cores: {}, images per core: {}".format(cpu_num, len(annotations_split[0])))
106
+ workers = multiprocessing.Pool(processes=cpu_num)
107
+ processes = []
108
+ for proc_id, annotations_set in enumerate(annotations_split):
109
+ p = workers.apply_async(convert_panoptic_to_detection_coco_format_single_core,
110
+ (proc_id, annotations_set, categories, segmentations_folder, things_only))
111
+ processes.append(p)
112
+ annotations_coco_detection = []
113
+ for p in processes:
114
+ annotations_coco_detection.extend(p.get())
115
+ for idx, ann in enumerate(annotations_coco_detection):
116
+ ann['id'] = idx
117
+
118
+ d_coco['annotations'] = annotations_coco_detection
119
+ categories_coco_detection = []
120
+ for category in d_coco['categories']:
121
+ if things_only and category['isthing'] != 1:
122
+ continue
123
+ category.pop('isthing')
124
+ categories_coco_detection.append(category)
125
+ d_coco['categories'] = categories_coco_detection
126
+ save_json(d_coco, output_json_file)
127
+
128
+ t_delta = time.time() - start_time
129
+ print("Time elapsed: {:0.2f} seconds".format(t_delta))
130
+
131
+
132
+ if __name__ == "__main__":
133
+ parser = argparse.ArgumentParser(
134
+ description="The script converts panoptic COCO format to detection \
135
+ COCO format. See this file's head for more information."
136
+ )
137
+ parser.add_argument('--things_only', action='store_true',
138
+ help="discard stuff classes")
139
+ args = parser.parse_args()
140
+
141
+ _root = os.getenv("DETECTRON2_DATASETS", "datasets")
142
+ root = os.path.join(_root, "coco")
143
+ input_json_file = os.path.join(root, "annotations", "panoptic_val2017.json")
144
+ output_json_file = os.path.join(root, "annotations", "panoptic2instances_val2017.json")
145
+ categories_json_file = "datasets/panoptic_coco_categories.json"
146
+ segmentations_folder = os.path.join(root, "panoptic_val2017")
147
+
148
+ convert_panoptic_to_detection_coco_format(input_json_file,
149
+ segmentations_folder,
150
+ output_json_file,
151
+ categories_json_file,
152
+ args.things_only)
annotator/OneFormer/datasets/prepare_ade20k_ins_seg.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import glob
5
+ import json
6
+ import os
7
+ from collections import Counter
8
+
9
+ import numpy as np
10
+ import tqdm
11
+ from panopticapi.utils import IdGenerator, save_json
12
+ from PIL import Image
13
+ import pycocotools.mask as mask_util
14
+
15
+
16
+ if __name__ == "__main__":
17
+ dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
18
+
19
+ for name, dirname in [("train", "training"), ("val", "validation")]:
20
+ image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/")
21
+ instance_dir = os.path.join(
22
+ dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/"
23
+ )
24
+
25
+ # img_id = 0
26
+ ann_id = 1
27
+
28
+ # json
29
+ out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json")
30
+
31
+ # json config
32
+ instance_config_file = "datasets/ade20k_instance_imgCatIds.json"
33
+ with open(instance_config_file) as f:
34
+ category_dict = json.load(f)["categories"]
35
+
36
+ # load catid mapping
37
+ # it is important to share category id for both instance and panoptic annotations
38
+ mapping_file = "datasets/ade20k_instance_catid_mapping.txt"
39
+ with open(mapping_file) as f:
40
+ map_id = {}
41
+ for i, line in enumerate(f.readlines()):
42
+ if i == 0:
43
+ continue
44
+ ins_id, sem_id, _ = line.strip().split()
45
+ # shift id by 1 because we want it to start from 0!
46
+ # ignore_label becomes 255
47
+ map_id[int(ins_id)] = int(sem_id) - 1
48
+
49
+ for cat in category_dict:
50
+ cat["id"] = map_id[cat["id"]]
51
+
52
+ filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
53
+
54
+ ann_dict = {}
55
+ images = []
56
+ annotations = []
57
+
58
+ for idx, filename in enumerate(tqdm.tqdm(filenames)):
59
+ image = {}
60
+ image_id = os.path.basename(filename).split(".")[0]
61
+
62
+ image["id"] = image_id
63
+ image["file_name"] = os.path.basename(filename)
64
+
65
+ original_format = np.array(Image.open(filename))
66
+ image["width"] = original_format.shape[1]
67
+ image["height"] = original_format.shape[0]
68
+
69
+ images.append(image)
70
+
71
+ filename_instance = os.path.join(instance_dir, image_id + ".png")
72
+ ins_seg = np.asarray(Image.open(filename_instance))
73
+ assert ins_seg.dtype == np.uint8
74
+
75
+ instance_cat_ids = ins_seg[..., 0]
76
+ # instance id starts from 1!
77
+ # because 0 is reserved as VOID label
78
+ instance_ins_ids = ins_seg[..., 1]
79
+
80
+ # process things
81
+ for thing_id in np.unique(instance_ins_ids):
82
+ if thing_id == 0:
83
+ continue
84
+ mask = instance_ins_ids == thing_id
85
+ instance_cat_id = np.unique(instance_cat_ids[mask])
86
+ assert len(instance_cat_id) == 1
87
+
88
+ anno = {}
89
+ anno['id'] = ann_id
90
+ ann_id += 1
91
+ anno['image_id'] = image['id']
92
+ anno["iscrowd"] = int(0)
93
+ anno["category_id"] = int(map_id[instance_cat_id[0]])
94
+
95
+ inds = np.nonzero(mask)
96
+ ymin, ymax = inds[0].min(), inds[0].max()
97
+ xmin, xmax = inds[1].min(), inds[1].max()
98
+ anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)]
99
+ # if xmax <= xmin or ymax <= ymin:
100
+ # continue
101
+ rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
102
+ rle["counts"] = rle["counts"].decode("utf-8")
103
+ anno["segmentation"] = rle
104
+ anno["area"] = int(mask_util.area(rle))
105
+ annotations.append(anno)
106
+
107
+ # save this
108
+ ann_dict['images'] = images
109
+ ann_dict['categories'] = category_dict
110
+ ann_dict['annotations'] = annotations
111
+
112
+ save_json(ann_dict, out_file)
annotator/OneFormer/datasets/prepare_ade20k_pan_seg.py ADDED
@@ -0,0 +1,500 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import glob
5
+ import json
6
+ import os
7
+ from collections import Counter
8
+
9
+ import numpy as np
10
+ import tqdm
11
+ from panopticapi.utils import IdGenerator, save_json
12
+ from PIL import Image
13
+
14
+ ADE20K_SEM_SEG_CATEGORIES = [
15
+ "wall",
16
+ "building",
17
+ "sky",
18
+ "floor",
19
+ "tree",
20
+ "ceiling",
21
+ "road, route",
22
+ "bed",
23
+ "window ",
24
+ "grass",
25
+ "cabinet",
26
+ "sidewalk, pavement",
27
+ "person",
28
+ "earth, ground",
29
+ "door",
30
+ "table",
31
+ "mountain, mount",
32
+ "plant",
33
+ "curtain",
34
+ "chair",
35
+ "car",
36
+ "water",
37
+ "painting, picture",
38
+ "sofa",
39
+ "shelf",
40
+ "house",
41
+ "sea",
42
+ "mirror",
43
+ "rug",
44
+ "field",
45
+ "armchair",
46
+ "seat",
47
+ "fence",
48
+ "desk",
49
+ "rock, stone",
50
+ "wardrobe, closet, press",
51
+ "lamp",
52
+ "tub",
53
+ "rail",
54
+ "cushion",
55
+ "base, pedestal, stand",
56
+ "box",
57
+ "column, pillar",
58
+ "signboard, sign",
59
+ "chest of drawers, chest, bureau, dresser",
60
+ "counter",
61
+ "sand",
62
+ "sink",
63
+ "skyscraper",
64
+ "fireplace",
65
+ "refrigerator, icebox",
66
+ "grandstand, covered stand",
67
+ "path",
68
+ "stairs",
69
+ "runway",
70
+ "case, display case, showcase, vitrine",
71
+ "pool table, billiard table, snooker table",
72
+ "pillow",
73
+ "screen door, screen",
74
+ "stairway, staircase",
75
+ "river",
76
+ "bridge, span",
77
+ "bookcase",
78
+ "blind, screen",
79
+ "coffee table",
80
+ "toilet, can, commode, crapper, pot, potty, stool, throne",
81
+ "flower",
82
+ "book",
83
+ "hill",
84
+ "bench",
85
+ "countertop",
86
+ "stove",
87
+ "palm, palm tree",
88
+ "kitchen island",
89
+ "computer",
90
+ "swivel chair",
91
+ "boat",
92
+ "bar",
93
+ "arcade machine",
94
+ "hovel, hut, hutch, shack, shanty",
95
+ "bus",
96
+ "towel",
97
+ "light",
98
+ "truck",
99
+ "tower",
100
+ "chandelier",
101
+ "awning, sunshade, sunblind",
102
+ "street lamp",
103
+ "booth",
104
+ "tv",
105
+ "plane",
106
+ "dirt track",
107
+ "clothes",
108
+ "pole",
109
+ "land, ground, soil",
110
+ "bannister, banister, balustrade, balusters, handrail",
111
+ "escalator, moving staircase, moving stairway",
112
+ "ottoman, pouf, pouffe, puff, hassock",
113
+ "bottle",
114
+ "buffet, counter, sideboard",
115
+ "poster, posting, placard, notice, bill, card",
116
+ "stage",
117
+ "van",
118
+ "ship",
119
+ "fountain",
120
+ "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
121
+ "canopy",
122
+ "washer, automatic washer, washing machine",
123
+ "plaything, toy",
124
+ "pool",
125
+ "stool",
126
+ "barrel, cask",
127
+ "basket, handbasket",
128
+ "falls",
129
+ "tent",
130
+ "bag",
131
+ "minibike, motorbike",
132
+ "cradle",
133
+ "oven",
134
+ "ball",
135
+ "food, solid food",
136
+ "step, stair",
137
+ "tank, storage tank",
138
+ "trade name",
139
+ "microwave",
140
+ "pot",
141
+ "animal",
142
+ "bicycle",
143
+ "lake",
144
+ "dishwasher",
145
+ "screen",
146
+ "blanket, cover",
147
+ "sculpture",
148
+ "hood, exhaust hood",
149
+ "sconce",
150
+ "vase",
151
+ "traffic light",
152
+ "tray",
153
+ "trash can",
154
+ "fan",
155
+ "pier",
156
+ "crt screen",
157
+ "plate",
158
+ "monitor",
159
+ "bulletin board",
160
+ "shower",
161
+ "radiator",
162
+ "glass, drinking glass",
163
+ "clock",
164
+ "flag", # noqa
165
+ ]
166
+
167
+ PALETTE = [
168
+ [120, 120, 120],
169
+ [180, 120, 120],
170
+ [6, 230, 230],
171
+ [80, 50, 50],
172
+ [4, 200, 3],
173
+ [120, 120, 80],
174
+ [140, 140, 140],
175
+ [204, 5, 255],
176
+ [230, 230, 230],
177
+ [4, 250, 7],
178
+ [224, 5, 255],
179
+ [235, 255, 7],
180
+ [150, 5, 61],
181
+ [120, 120, 70],
182
+ [8, 255, 51],
183
+ [255, 6, 82],
184
+ [143, 255, 140],
185
+ [204, 255, 4],
186
+ [255, 51, 7],
187
+ [204, 70, 3],
188
+ [0, 102, 200],
189
+ [61, 230, 250],
190
+ [255, 6, 51],
191
+ [11, 102, 255],
192
+ [255, 7, 71],
193
+ [255, 9, 224],
194
+ [9, 7, 230],
195
+ [220, 220, 220],
196
+ [255, 9, 92],
197
+ [112, 9, 255],
198
+ [8, 255, 214],
199
+ [7, 255, 224],
200
+ [255, 184, 6],
201
+ [10, 255, 71],
202
+ [255, 41, 10],
203
+ [7, 255, 255],
204
+ [224, 255, 8],
205
+ [102, 8, 255],
206
+ [255, 61, 6],
207
+ [255, 194, 7],
208
+ [255, 122, 8],
209
+ [0, 255, 20],
210
+ [255, 8, 41],
211
+ [255, 5, 153],
212
+ [6, 51, 255],
213
+ [235, 12, 255],
214
+ [160, 150, 20],
215
+ [0, 163, 255],
216
+ [140, 140, 200],
217
+ [250, 10, 15],
218
+ [20, 255, 0],
219
+ [31, 255, 0],
220
+ [255, 31, 0],
221
+ [255, 224, 0],
222
+ [153, 255, 0],
223
+ [0, 0, 255],
224
+ [255, 71, 0],
225
+ [0, 235, 255],
226
+ [0, 173, 255],
227
+ [31, 0, 255],
228
+ [11, 200, 200],
229
+ [255, 82, 0],
230
+ [0, 255, 245],
231
+ [0, 61, 255],
232
+ [0, 255, 112],
233
+ [0, 255, 133],
234
+ [255, 0, 0],
235
+ [255, 163, 0],
236
+ [255, 102, 0],
237
+ [194, 255, 0],
238
+ [0, 143, 255],
239
+ [51, 255, 0],
240
+ [0, 82, 255],
241
+ [0, 255, 41],
242
+ [0, 255, 173],
243
+ [10, 0, 255],
244
+ [173, 255, 0],
245
+ [0, 255, 153],
246
+ [255, 92, 0],
247
+ [255, 0, 255],
248
+ [255, 0, 245],
249
+ [255, 0, 102],
250
+ [255, 173, 0],
251
+ [255, 0, 20],
252
+ [255, 184, 184],
253
+ [0, 31, 255],
254
+ [0, 255, 61],
255
+ [0, 71, 255],
256
+ [255, 0, 204],
257
+ [0, 255, 194],
258
+ [0, 255, 82],
259
+ [0, 10, 255],
260
+ [0, 112, 255],
261
+ [51, 0, 255],
262
+ [0, 194, 255],
263
+ [0, 122, 255],
264
+ [0, 255, 163],
265
+ [255, 153, 0],
266
+ [0, 255, 10],
267
+ [255, 112, 0],
268
+ [143, 255, 0],
269
+ [82, 0, 255],
270
+ [163, 255, 0],
271
+ [255, 235, 0],
272
+ [8, 184, 170],
273
+ [133, 0, 255],
274
+ [0, 255, 92],
275
+ [184, 0, 255],
276
+ [255, 0, 31],
277
+ [0, 184, 255],
278
+ [0, 214, 255],
279
+ [255, 0, 112],
280
+ [92, 255, 0],
281
+ [0, 224, 255],
282
+ [112, 224, 255],
283
+ [70, 184, 160],
284
+ [163, 0, 255],
285
+ [153, 0, 255],
286
+ [71, 255, 0],
287
+ [255, 0, 163],
288
+ [255, 204, 0],
289
+ [255, 0, 143],
290
+ [0, 255, 235],
291
+ [133, 255, 0],
292
+ [255, 0, 235],
293
+ [245, 0, 255],
294
+ [255, 0, 122],
295
+ [255, 245, 0],
296
+ [10, 190, 212],
297
+ [214, 255, 0],
298
+ [0, 204, 255],
299
+ [20, 0, 255],
300
+ [255, 255, 0],
301
+ [0, 153, 255],
302
+ [0, 41, 255],
303
+ [0, 255, 204],
304
+ [41, 0, 255],
305
+ [41, 255, 0],
306
+ [173, 0, 255],
307
+ [0, 245, 255],
308
+ [71, 0, 255],
309
+ [122, 0, 255],
310
+ [0, 255, 184],
311
+ [0, 92, 255],
312
+ [184, 255, 0],
313
+ [0, 133, 255],
314
+ [255, 214, 0],
315
+ [25, 194, 194],
316
+ [102, 255, 0],
317
+ [92, 0, 255],
318
+ ]
319
+
320
+
321
+ if __name__ == "__main__":
322
+ dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
323
+
324
+ for name, dirname in [("train", "training"), ("val", "validation")]:
325
+ image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/")
326
+ semantic_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/annotations/{dirname}/")
327
+ instance_dir = os.path.join(
328
+ dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/"
329
+ )
330
+
331
+ # folder to store panoptic PNGs
332
+ out_folder = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_panoptic_{name}/")
333
+ # json with segmentations information
334
+ out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_panoptic_{name}.json")
335
+
336
+ if not os.path.isdir(out_folder):
337
+ print("Creating folder {} for panoptic segmentation PNGs".format(out_folder))
338
+ os.mkdir(out_folder)
339
+
340
+ # json config
341
+ config_file = "datasets/ade20k_instance_imgCatIds.json"
342
+ with open(config_file) as f:
343
+ config = json.load(f)
344
+
345
+ # load catid mapping
346
+ mapping_file = "datasets/ade20k_instance_catid_mapping.txt"
347
+ with open(mapping_file) as f:
348
+ map_id = {}
349
+ for i, line in enumerate(f.readlines()):
350
+ if i == 0:
351
+ continue
352
+ ins_id, sem_id, _ = line.strip().split()
353
+ # shift id by 1 because we want it to start from 0!
354
+ # ignore_label becomes 255
355
+ map_id[int(ins_id) - 1] = int(sem_id) - 1
356
+
357
+ ADE20K_150_CATEGORIES = []
358
+ for cat_id, cat_name in enumerate(ADE20K_SEM_SEG_CATEGORIES):
359
+ ADE20K_150_CATEGORIES.append(
360
+ {
361
+ "name": cat_name,
362
+ "id": cat_id,
363
+ "isthing": int(cat_id in map_id.values()),
364
+ "color": PALETTE[cat_id],
365
+ }
366
+ )
367
+ categories_dict = {cat["id"]: cat for cat in ADE20K_150_CATEGORIES}
368
+
369
+ panoptic_json_categories = ADE20K_150_CATEGORIES[:]
370
+ panoptic_json_images = []
371
+ panoptic_json_annotations = []
372
+
373
+ filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
374
+ for idx, filename in enumerate(tqdm.tqdm(filenames)):
375
+ panoptic_json_image = {}
376
+ panoptic_json_annotation = {}
377
+
378
+ image_id = os.path.basename(filename).split(".")[0]
379
+
380
+ panoptic_json_image["id"] = image_id
381
+ panoptic_json_image["file_name"] = os.path.basename(filename)
382
+
383
+ original_format = np.array(Image.open(filename))
384
+ panoptic_json_image["width"] = original_format.shape[1]
385
+ panoptic_json_image["height"] = original_format.shape[0]
386
+
387
+ pan_seg = np.zeros(
388
+ (original_format.shape[0], original_format.shape[1], 3), dtype=np.uint8
389
+ )
390
+ id_generator = IdGenerator(categories_dict)
391
+
392
+ filename_semantic = os.path.join(semantic_dir, image_id + ".png")
393
+ filename_instance = os.path.join(instance_dir, image_id + ".png")
394
+
395
+ sem_seg = np.asarray(Image.open(filename_semantic))
396
+ ins_seg = np.asarray(Image.open(filename_instance))
397
+
398
+ assert sem_seg.dtype == np.uint8
399
+ assert ins_seg.dtype == np.uint8
400
+
401
+ semantic_cat_ids = sem_seg - 1
402
+ instance_cat_ids = ins_seg[..., 0] - 1
403
+ # instance id starts from 1!
404
+ # because 0 is reserved as VOID label
405
+ instance_ins_ids = ins_seg[..., 1]
406
+
407
+ segm_info = []
408
+
409
+ # NOTE: there is some overlap between semantic and instance annotation
410
+ # thus we paste stuffs first
411
+
412
+ # process stuffs
413
+ for semantic_cat_id in np.unique(semantic_cat_ids):
414
+ if semantic_cat_id == 255:
415
+ continue
416
+ if categories_dict[semantic_cat_id]["isthing"]:
417
+ continue
418
+ mask = semantic_cat_ids == semantic_cat_id
419
+ # should not have any overlap
420
+ assert pan_seg[mask].sum() == 0
421
+
422
+ segment_id, color = id_generator.get_id_and_color(semantic_cat_id)
423
+ pan_seg[mask] = color
424
+
425
+ area = np.sum(mask) # segment area computation
426
+ # bbox computation for a segment
427
+ hor = np.sum(mask, axis=0)
428
+ hor_idx = np.nonzero(hor)[0]
429
+ x = hor_idx[0]
430
+ width = hor_idx[-1] - x + 1
431
+ vert = np.sum(mask, axis=1)
432
+ vert_idx = np.nonzero(vert)[0]
433
+ y = vert_idx[0]
434
+ height = vert_idx[-1] - y + 1
435
+ bbox = [int(x), int(y), int(width), int(height)]
436
+
437
+ segm_info.append(
438
+ {
439
+ "id": int(segment_id),
440
+ "category_id": int(semantic_cat_id),
441
+ "area": int(area),
442
+ "bbox": bbox,
443
+ "iscrowd": 0,
444
+ }
445
+ )
446
+
447
+ # process things
448
+ for thing_id in np.unique(instance_ins_ids):
449
+ if thing_id == 0:
450
+ continue
451
+ mask = instance_ins_ids == thing_id
452
+ instance_cat_id = np.unique(instance_cat_ids[mask])
453
+ assert len(instance_cat_id) == 1
454
+
455
+ semantic_cat_id = map_id[instance_cat_id[0]]
456
+
457
+ segment_id, color = id_generator.get_id_and_color(semantic_cat_id)
458
+ pan_seg[mask] = color
459
+
460
+ area = np.sum(mask) # segment area computation
461
+ # bbox computation for a segment
462
+ hor = np.sum(mask, axis=0)
463
+ hor_idx = np.nonzero(hor)[0]
464
+ x = hor_idx[0]
465
+ width = hor_idx[-1] - x + 1
466
+ vert = np.sum(mask, axis=1)
467
+ vert_idx = np.nonzero(vert)[0]
468
+ y = vert_idx[0]
469
+ height = vert_idx[-1] - y + 1
470
+ bbox = [int(x), int(y), int(width), int(height)]
471
+
472
+ segm_info.append(
473
+ {
474
+ "id": int(segment_id),
475
+ "category_id": int(semantic_cat_id),
476
+ "area": int(area),
477
+ "bbox": bbox,
478
+ "iscrowd": 0,
479
+ }
480
+ )
481
+
482
+ panoptic_json_annotation = {
483
+ "image_id": image_id,
484
+ "file_name": image_id + ".png",
485
+ "segments_info": segm_info,
486
+ }
487
+
488
+ Image.fromarray(pan_seg).save(os.path.join(out_folder, image_id + ".png"))
489
+
490
+ panoptic_json_images.append(panoptic_json_image)
491
+ panoptic_json_annotations.append(panoptic_json_annotation)
492
+
493
+ # save this
494
+ d = {
495
+ "images": panoptic_json_images,
496
+ "annotations": panoptic_json_annotations,
497
+ "categories": panoptic_json_categories,
498
+ }
499
+
500
+ save_json(d, out_file)
annotator/OneFormer/datasets/prepare_ade20k_sem_seg.py ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import tqdm
9
+ from PIL import Image
10
+
11
+
12
+ def convert(input, output):
13
+ img = np.asarray(Image.open(input))
14
+ assert img.dtype == np.uint8
15
+ img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
16
+ Image.fromarray(img).save(output)
17
+
18
+
19
+ if __name__ == "__main__":
20
+ dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
21
+ for name in ["training", "validation"]:
22
+ annotation_dir = dataset_dir / "annotations" / name
23
+ output_dir = dataset_dir / "annotations_detectron2" / name
24
+ output_dir.mkdir(parents=True, exist_ok=True)
25
+ for file in tqdm.tqdm(list(annotation_dir.iterdir())):
26
+ output_file = output_dir / file.name
27
+ convert(file, output_file)
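
The `img = img - 1` line relies on uint8 wrap-around: ADE20K stores the ignore label as 0 and the classes as 1-150, so subtracting 1 sends the ignore label to 255 (the value detectron2 treats as ignore) and shifts the classes to 0-149. A quick check of that behavior:

import numpy as np

labels = np.array([0, 1, 150], dtype=np.uint8)
print(labels - 1)  # [255   0 149] -> ignore becomes 255, classes shift to 0-149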
annotator/OneFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates.
+
+ import functools
+ import json
+ import multiprocessing as mp
+ import numpy as np
+ import os
+ import time
+ from fvcore.common.download import download
+ from panopticapi.utils import rgb2id
+ from PIL import Image
+
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+
+
+ def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
+     panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
+     panoptic = rgb2id(panoptic)
+     output = np.zeros_like(panoptic, dtype=np.uint8) + 255
+     for seg in segments:
+         cat_id = seg["category_id"]
+         new_cat_id = id_map[cat_id]
+         output[panoptic == seg["id"]] = new_cat_id
+     Image.fromarray(output).save(output_semantic)
+
+
+ def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
+     """
+     Create semantic segmentation annotations from panoptic segmentation
+     annotations, to be used for semantic segmentation evaluation.
+     It maps all unlabeled pixels to class 255 and maps every category in
+     `categories` (thing and stuff alike) to a contiguous id, starting from 0.
+     Args:
+         panoptic_json (str): path to the panoptic json file, in COCO's format.
+         panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
+         sem_seg_root (str): a directory to output semantic annotation files
+         categories (list[dict]): category metadata. Each dict needs to have:
+             "id": corresponds to the "category_id" in the json annotations
+             "isthing": 0 or 1
+     """
+     os.makedirs(sem_seg_root, exist_ok=True)
+
+     id_map = {}  # map from category id to id in the output semantic annotation
+     assert len(categories) <= 254
+     for i, k in enumerate(categories):
+         id_map[k["id"]] = i
+     # what is id = 0?
+     # id_map[0] = 255
+     print(id_map)
+
+     with open(panoptic_json) as f:
+         obj = json.load(f)
+
+     pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
+
+     def iter_annotations():
+         for anno in obj["annotations"]:
+             file_name = anno["file_name"]
+             segments = anno["segments_info"]
+             input = os.path.join(panoptic_root, file_name)
+             output = os.path.join(sem_seg_root, file_name)
+             yield input, output, segments
+
+     print("Start writing to {} ...".format(sem_seg_root))
+     start = time.time()
+     pool.starmap(
+         functools.partial(_process_panoptic_to_semantic, id_map=id_map),
+         iter_annotations(),
+         chunksize=100,
+     )
+     print("Finished. time: {:.2f}s".format(time.time() - start))
+
+
+ if __name__ == "__main__":
+     dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
+     for s in ["val2017", "train2017"]:
+         separate_coco_semantic_from_panoptic(
+             os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
+             os.path.join(dataset_dir, "panoptic_{}".format(s)),
+             os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)),
+             COCO_CATEGORIES,
+         )
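
Unlike the PanopticFPN variant this script is adapted from, the mapping here keeps thing and stuff classes distinct: every entry of COCO_CATEGORIES gets a contiguous id by position, and unlabeled pixels are written as 255, which is also why the code asserts `len(categories) <= 254` so that all ids fit in a uint8 PNG alongside the ignore value. A small sketch to inspect the same mapping the script builds (assuming detectron2 is installed):

from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

# Mirror of the id_map built above: original COCO category id -> contiguous train id.
id_map = {k["id"]: i for i, k in enumerate(COCO_CATEGORIES)}
print(len(id_map), min(id_map.values()), max(id_map.values()))  # class count and id range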
annotator/OneFormer/demo/defaults.py ADDED
@@ -0,0 +1,82 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+ # ------------------------------------------------------------------------------
+
+ import torch
+ import detectron2.data.transforms as T
+ from detectron2.checkpoint import DetectionCheckpointer
+ from detectron2.data import (
+     MetadataCatalog,
+ )
+ from detectron2.modeling import build_model
+
+
+ __all__ = [
+     "DefaultPredictor",
+ ]
+
+
+ class DefaultPredictor:
+     """
+     Create a simple end-to-end predictor with the given config that runs on
+     a single device for a single input image.
+     Compared to using the model directly, this class makes the following additions:
+     1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+     2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
+     3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+     4. Take one input image and produce a single output, instead of a batch.
+     This is meant for simple demo purposes, so it does the above steps automatically.
+     This is not meant for benchmarks or running complicated inference logic.
+     If you'd like to do anything more complicated, please refer to its source code as
+     examples to build and use the model manually.
+     Attributes:
+         metadata (Metadata): the metadata of the underlying dataset, obtained from
+             cfg.DATASETS.TEST.
+     Examples:
+     ::
+         pred = DefaultPredictor(cfg)
+         inputs = cv2.imread("input.jpg")
+         outputs = pred(inputs)
+     """
+
+     def __init__(self, cfg):
+         self.cfg = cfg.clone()  # cfg can be modified by model
+         self.model = build_model(self.cfg)
+         self.model.eval()
+         if len(cfg.DATASETS.TEST):
+             self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+         checkpointer = DetectionCheckpointer(self.model)
+         checkpointer.load(cfg.MODEL.WEIGHTS)
+
+         self.aug = T.ResizeShortestEdge(
+             [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+         )
+
+         self.input_format = cfg.INPUT.FORMAT
+         assert self.input_format in ["RGB", "BGR"], self.input_format
+
+     def __call__(self, original_image, task):
+         """
+         Args:
+             original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+         Returns:
+             predictions (dict):
+                 the output of the model for one image only.
+                 See :doc:`/tutorials/models` for details about the format.
+         """
+         with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
+             # Apply pre-processing to image.
+             if self.input_format == "RGB":
+                 # whether the model expects BGR inputs or RGB
+                 original_image = original_image[:, :, ::-1]
+             height, width = original_image.shape[:2]
+             image = self.aug.get_transform(original_image).apply_image(original_image)
+             image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+
+             task = f"The task is {task}"
+
+             inputs = {"image": image, "height": height, "width": width, "task": task}
+             predictions = self.model([inputs])[0]
+             return predictions
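
A minimal usage sketch for this predictor, assuming `cfg` has already been assembled with the helpers in oneformer/config.py (see the sketch after that file below) and `cfg.MODEL.WEIGHTS` points at a OneFormer checkpoint; the import path, image path, and task string are illustrative:

import cv2
from defaults import DefaultPredictor  # adjust the import to wherever this module lives

predictor = DefaultPredictor(cfg)            # cfg: a prepared detectron2 CfgNode (see below)
image = cv2.imread("input.jpg")              # BGR, matching the default cfg.INPUT.FORMAT
outputs = predictor(image, task="panoptic")  # wrapped internally as "The task is panoptic"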
annotator/OneFormer/oneformer/__init__.py ADDED
@@ -0,0 +1,18 @@
+ from . import data # register all new datasets
+ from . import modeling
+
+ # config
+ from .config import *
+
+ # dataset loading
+ from .data.dataset_mappers.coco_unified_new_baseline_dataset_mapper import COCOUnifiedNewBaselineDatasetMapper
+ from .data.dataset_mappers.oneformer_unified_dataset_mapper import (
+     OneFormerUnifiedDatasetMapper,
+ )
+
+ # models
+ from .oneformer_model import OneFormer
+ from .test_time_augmentation import SemanticSegmentorWithTTA
+
+ # evaluation
+ from .evaluation.instance_evaluation import InstanceSegEvaluator
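
These imports exist mainly for their side effects: pulling in the package registers the custom datasets, dataset mappers, and the OneFormer architecture with detectron2's registries. A short sketch, assuming the directory containing this package is on PYTHONPATH:

import oneformer  # noqa: F401  -- registers datasets and the OneFormer model on import
from oneformer import OneFormer, add_oneformer_config  # config helpers are re-exported via .config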
annotator/OneFormer/oneformer/config.py ADDED
@@ -0,0 +1,210 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ from detectron2.config import CfgNode as CN
+
+ __all__ = ["add_common_config", "add_oneformer_config", "add_swin_config",
+            "add_dinat_config", "add_convnext_config"]
+
+ def add_common_config(cfg):
+     """
+     Add common configuration options.
+     """
+
+     # data config
+     # select the dataset mapper
+     cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified"
+     # Color augmentation
+     cfg.INPUT.COLOR_AUG_SSD = False
+     # We retry random cropping until no single category in semantic segmentation GT occupies more
+     # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+     cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+     # Pad image and segmentation GT in dataset mapper.
+     cfg.INPUT.SIZE_DIVISIBILITY = -1
+
+     cfg.INPUT.TASK_SEQ_LEN = 77
+     cfg.INPUT.MAX_SEQ_LEN = 77
+
+     cfg.INPUT.TASK_PROB = CN()
+     cfg.INPUT.TASK_PROB.SEMANTIC = 0.33
+     cfg.INPUT.TASK_PROB.INSTANCE = 0.66
+
+     # test dataset
+     cfg.DATASETS.TEST_PANOPTIC = ("",)
+     cfg.DATASETS.TEST_INSTANCE = ("",)
+     cfg.DATASETS.TEST_SEMANTIC = ("",)
+
+     # solver config
+     # weight decay on embedding
+     cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
+     # optimizer
+     cfg.SOLVER.OPTIMIZER = "ADAMW"
+     cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
+
+     # wandb
+     cfg.WANDB = CN()
+     cfg.WANDB.PROJECT = "OneFormer"
+     cfg.WANDB.NAME = None
+
+     cfg.MODEL.IS_TRAIN = True
+     cfg.MODEL.IS_DEMO = False
+
+     # text encoder config
+     cfg.MODEL.TEXT_ENCODER = CN()
+
+     cfg.MODEL.TEXT_ENCODER.WIDTH = 256
+     cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77
+     cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12
+     cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408
+     cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2
+     cfg.MODEL.TEXT_ENCODER.N_CTX = 16
+
+     # oneformer inference config
+     cfg.MODEL.TEST = CN()
+     cfg.MODEL.TEST.SEMANTIC_ON = True
+     cfg.MODEL.TEST.INSTANCE_ON = False
+     cfg.MODEL.TEST.PANOPTIC_ON = False
+     cfg.MODEL.TEST.DETECTION_ON = False
+     cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0
+     cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0
+     cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
+     cfg.MODEL.TEST.TASK = "panoptic"
+
+     # TEST AUG Slide
+     cfg.TEST.AUG.IS_SLIDE = False
+     cfg.TEST.AUG.CROP_SIZE = (640, 640)
+     cfg.TEST.AUG.STRIDE = (426, 426)
+     cfg.TEST.AUG.SCALE = (2048, 640)
+     cfg.TEST.AUG.SETR_MULTI_SCALE = True
+     cfg.TEST.AUG.KEEP_RATIO = True
+     cfg.TEST.AUG.SIZE_DIVISOR = 32
+
+     # pixel decoder config
+     cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
+     # adding transformer in pixel decoder
+     cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
+     # pixel decoder
+     cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
+     cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256
+     cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256
+
+     # LSJ aug
+     cfg.INPUT.IMAGE_SIZE = 1024
+     cfg.INPUT.MIN_SCALE = 0.1
+     cfg.INPUT.MAX_SCALE = 2.0
+
+     # MSDeformAttn encoder configs
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
+
+ def add_oneformer_config(cfg):
+     """
+     Add config for ONE_FORMER.
+     """
+
+     # oneformer model config
+     cfg.MODEL.ONE_FORMER = CN()
+
+     # loss
+     cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True
+     cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1
+     cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0
+     cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0
+     cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0
+     cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5
+     cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07
+
+     # transformer config
+     cfg.MODEL.ONE_FORMER.NHEADS = 8
+     cfg.MODEL.ONE_FORMER.DROPOUT = 0.1
+     cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048
+     cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0
+     cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2
+     cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6
+     cfg.MODEL.ONE_FORMER.PRE_NORM = False
+
+     cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256
+     cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120
+     cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16
+     cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True
+
+     cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5"
+     cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False
+
+     # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
+     # you can use this config to override
+     cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32
+
+     # transformer module
+     cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder"
+
+     # point loss configs
+     # Number of points sampled during training for a mask point head.
+     cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112
+     # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
+     # original paper.
+     cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0
+     # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
+     # the original paper.
+     cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
+
+ def add_swin_config(cfg):
+     """
+     Add config for Swin Backbone.
+     """
+
+     # swin transformer backbone
+     cfg.MODEL.SWIN = CN()
+     cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
+     cfg.MODEL.SWIN.PATCH_SIZE = 4
+     cfg.MODEL.SWIN.EMBED_DIM = 96
+     cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+     cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+     cfg.MODEL.SWIN.WINDOW_SIZE = 7
+     cfg.MODEL.SWIN.MLP_RATIO = 4.0
+     cfg.MODEL.SWIN.QKV_BIAS = True
+     cfg.MODEL.SWIN.QK_SCALE = None
+     cfg.MODEL.SWIN.DROP_RATE = 0.0
+     cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
+     cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
+     cfg.MODEL.SWIN.APE = False
+     cfg.MODEL.SWIN.PATCH_NORM = True
+     cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+     cfg.MODEL.SWIN.USE_CHECKPOINT = False
+
+ def add_dinat_config(cfg):
+     """
+     Add config for DiNAT Backbone.
+     """
+
+     # DiNAT transformer backbone
+     cfg.MODEL.DiNAT = CN()
+     cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5]
+     cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+     cfg.MODEL.DiNAT.EMBED_DIM = 64
+     cfg.MODEL.DiNAT.MLP_RATIO = 3.0
+     cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16]
+     cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2
+     cfg.MODEL.DiNAT.KERNEL_SIZE = 7
+     cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+     cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3)
+     cfg.MODEL.DiNAT.QKV_BIAS = True
+     cfg.MODEL.DiNAT.QK_SCALE = None
+     cfg.MODEL.DiNAT.DROP_RATE = 0
+     cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0.
+     cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4
+
+ def add_convnext_config(cfg):
+     """
+     Add config for ConvNeXt Backbone.
+     """
+
+     # ConvNeXt backbone
+     cfg.MODEL.CONVNEXT = CN()
+     cfg.MODEL.CONVNEXT.IN_CHANNELS = 3
+     cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3]
+     cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536]
+     cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4
+     cfg.MODEL.CONVNEXT.LSIT = 1.0
+     cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3]
+     cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
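
A sketch of how these helpers are usually composed into a full config, mirroring other OneFormer entry points (add_deeplab_config comes from detectron2's DeepLab project; the yaml path refers to the configs shipped in this repo, and the import assumes the package is importable as `oneformer`):

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from oneformer.config import (
    add_common_config,
    add_oneformer_config,
    add_swin_config,
    add_dinat_config,
    add_convnext_config,
)

cfg = get_cfg()
add_deeplab_config(cfg)      # base keys referenced by the OneFormer yamls
add_common_config(cfg)
add_swin_config(cfg)
add_dinat_config(cfg)
add_convnext_config(cfg)
add_oneformer_config(cfg)
cfg.merge_from_file("configs/ade20k/oneformer_R50_bs16_160k.yaml")  # path relative to annotator/OneFormer
cfg.freeze()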