vidit98 committed on
Commit
2171e8f
0 Parent(s):

demo files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +34 -0
  2. .gitignore +45 -0
  3. Dockerfile +68 -0
  4. README.md +10 -0
  5. annotator/OneFormer/__init__.py +61 -0
  6. annotator/OneFormer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml +68 -0
  7. annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml +38 -0
  8. annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml +38 -0
  9. annotator/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml +42 -0
  10. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml +42 -0
  11. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml +42 -0
  12. annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml +42 -0
  13. annotator/OneFormer/configs/ade20k/oneformer_R50_bs16_160k.yaml +58 -0
  14. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml +40 -0
  15. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml +40 -0
  16. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml +40 -0
  17. annotator/OneFormer/configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml +15 -0
  18. annotator/OneFormer/configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml +68 -0
  19. annotator/OneFormer/configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml +18 -0
  20. annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml +18 -0
  21. annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml +18 -0
  22. annotator/OneFormer/configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml +22 -0
  23. annotator/OneFormer/configs/cityscapes/oneformer_R50_bs16_90k.yaml +59 -0
  24. annotator/OneFormer/configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml +20 -0
  25. annotator/OneFormer/configs/coco/Base-COCO-UnifiedSegmentation.yaml +54 -0
  26. annotator/OneFormer/configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml +24 -0
  27. annotator/OneFormer/configs/coco/oneformer_R50_bs16_50ep.yaml +59 -0
  28. annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml +28 -0
  29. annotator/OneFormer/configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml +15 -0
  30. annotator/OneFormer/configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml +68 -0
  31. annotator/OneFormer/configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml +22 -0
  32. annotator/OneFormer/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml +18 -0
  33. annotator/OneFormer/configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml +22 -0
  34. annotator/OneFormer/configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml +59 -0
  35. annotator/OneFormer/configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml +20 -0
  36. annotator/OneFormer/datasets/README.md +168 -0
  37. annotator/OneFormer/datasets/ade20k_instance_catid_mapping.txt +104 -0
  38. annotator/OneFormer/datasets/custom_datasets/README.md +35 -0
  39. annotator/OneFormer/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py +235 -0
  40. annotator/OneFormer/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py +245 -0
  41. annotator/OneFormer/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py +238 -0
  42. annotator/OneFormer/datasets/fg_ids.py +108 -0
  43. annotator/OneFormer/datasets/panoptic2detection_coco_format.py +152 -0
  44. annotator/OneFormer/datasets/prepare_ade20k_ins_seg.py +112 -0
  45. annotator/OneFormer/datasets/prepare_ade20k_pan_seg.py +500 -0
  46. annotator/OneFormer/datasets/prepare_ade20k_sem_seg.py +27 -0
  47. annotator/OneFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py +84 -0
  48. annotator/OneFormer/demo/defaults.py +82 -0
  49. annotator/OneFormer/oneformer/__init__.py +18 -0
  50. annotator/OneFormer/oneformer/config.py +210 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,45 @@
+ *_video
+ *_video.py
+ extras/
+
+ # output dir
+ .DS_Store
+ output
+
+ *.json
+ *.diff
+ *.jpg
+ *.ckpt
+
+ # compilation and distribution
+ __pycache__
+ _ext
+ *.pyc
+ *.pyd
+ *.so
+ *.dll
+ *.egg-info/
+ build/
+ dist/
+ wheels/
+
+ # pytorch/python/numpy formats
+ *.pth
+ *.pkl
+ *.npy
+ *.ts
+ model_ts*.txt
+
+ # ipython/jupyter notebooks
+ **/.ipynb_checkpoints/
+
+ # Editor temporaries
+ *.swn
+ *.swo
+ *.swp
+ *~
+
+ # editor settings
+ .idea
+ .vscode
+ _darcs
Dockerfile ADDED
@@ -0,0 +1,68 @@
+ FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
+ CMD nvidia-smi
+
+ ENV DEBIAN_FRONTEND noninteractive
+ RUN apt-get update && apt-get install -y \
+     git \
+     make build-essential libssl-dev zlib1g-dev \
+     libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
+     libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
+     ffmpeg libsm6 libxext6 cmake libgl1-mesa-glx \
+     && rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -ms /bin/bash user
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH
+ RUN pyenv install 3.8.15 && \
+     pyenv global 3.8.15 && \
+     pyenv rehash && \
+     pip install --no-cache-dir --upgrade pip setuptools wheel
+
+ ENV WORKDIR=/code
+ WORKDIR $WORKDIR
+ RUN chown -R user:user $WORKDIR
+ RUN chmod -R 777 $WORKDIR
+
+ COPY requirements.txt $WORKDIR/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r $WORKDIR/requirements.txt
+ RUN pip install ninja
+
+ COPY . .
+
+ ARG TORCH_CUDA_ARCH_LIST=7.5+PTX
+
+ USER root
+ RUN chown -R user:user $HOME
+ RUN chmod -R 777 $HOME
+ RUN chown -R user:user $WORKDIR
+ RUN chmod -R 777 $WORKDIR
+
+ USER user
+ RUN ln -s $WORKDIR/annotator/OneFormer/oneformer/modeling/pixel_decoder/ops $WORKDIR/ && ls
+ RUN cd ops/ && FORCE_CUDA=1 python setup.py build --build-base=$WORKDIR/ install --user && cd ..
+ RUN sh deform_setup.sh
+
+ USER user
+ RUN sh deform_setup.sh
+
+ USER user
+
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ RUN --mount=type=secret,id=ACCESS_TOKEN,mode=0444,required=true
+
+
+ EXPOSE 7860
+
+ ENTRYPOINT ["python", "app.py"]
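The image compiles the deformable-attention CUDA ops under `pixel_decoder/ops` with `FORCE_CUDA=1` before the Gradio app starts. A minimal sanity-check sketch one might run inside the built container; the `MultiScaleDeformableAttention` module name is an assumption based on the upstream ops package, not something stated in this commit:

```python
# Hypothetical post-build smoke test for the container environment.
import torch

print("CUDA available:", torch.cuda.is_available())

try:
    # Assumed name of the extension built by pixel_decoder/ops/setup.py.
    import MultiScaleDeformableAttention  # noqa: F401
    print("Deformable attention ops: OK")
except ImportError as err:
    print("Deformable attention ops missing:", err)
```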
README.md ADDED
@@ -0,0 +1,10 @@
+ ---
+ title: PAIR Diffusion
+ emoji: 📚
+ colorFrom: purple
+ colorTo: gray
+ sdk: docker
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
annotator/OneFormer/__init__.py ADDED
@@ -0,0 +1,61 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/SHI-Labs/OneFormer
+ # Modified by Vidit Goel (https://github.com/vidit98)
+ # ------------------------------------------------------------------------------
+
+ import os
+ import random
+ # fmt: off
+ import sys
+ sys.path.insert(1, './annotator/OneFormer')
+ # fmt: on
+
+ import imutils
+ import cv2
+ import numpy as np
+
+ from detectron2.config import get_cfg
+ from detectron2.projects.deeplab import add_deeplab_config
+ from detectron2.data import MetadataCatalog
+
+ from oneformer import (
+     add_oneformer_config,
+     add_common_config,
+     add_swin_config,
+     add_dinat_config,
+     add_convnext_config,
+ )
+ from demo.defaults import DefaultPredictor
+
+
+ def setup_cfg(config_file, wts):
+     # load config from file and command-line arguments
+     cfg = get_cfg()
+     add_deeplab_config(cfg)
+     add_common_config(cfg)
+     add_swin_config(cfg)
+     add_dinat_config(cfg)
+     add_convnext_config(cfg)
+     add_oneformer_config(cfg)
+     cfg.merge_from_file(config_file)
+     cfg.MODEL.WEIGHTS = wts
+     cfg.freeze()
+     return cfg
+
+
+ class OneformerSegmenter:
+     def __init__(self, wts, config='./annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml', confidence_thresh=0.5):
+         cfg = setup_cfg(config, wts)
+         metadata = MetadataCatalog.get(cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused")
+         self.predictor = DefaultPredictor(cfg)
+         self.metadata = metadata
+
+     def __call__(self, img, task):
+         if task == 'panoptic':
+             predictions = self.predictor(img, "panoptic")
+             panoptic_seg, segments_info = predictions["panoptic_seg"]
+             return panoptic_seg, segments_info
+         elif task == 'semantic':
+             predictions = self.predictor(img, "semantic")
+             semask = predictions["sem_seg"].argmax(dim=0)
+             return semask
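For reference, a minimal usage sketch of the `OneformerSegmenter` wrapper above; the checkpoint path is an assumption (any OneFormer COCO checkpoint matching the default Swin-L config would do):

```python
# Hypothetical usage of the OneformerSegmenter wrapper defined above.
import cv2
from annotator.OneFormer import OneformerSegmenter

segmenter = OneformerSegmenter(wts="./checkpoints/150_16_swin_l_oneformer_coco_100ep.pth")

img = cv2.imread("example.jpg")  # HxWx3 image as read by OpenCV

# Panoptic task: per-pixel segment ids plus per-segment metadata.
panoptic_seg, segments_info = segmenter(img, task="panoptic")

# Semantic task: per-pixel class ids.
semantic_mask = segmenter(img, task="semantic")
```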
annotator/OneFormer/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     # NORM: "SyncBN"
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("ade20k_panoptic_train",)
+   TEST_PANOPTIC: ("ade20k_panoptic_val",)
+   TEST_INSTANCE: ("ade20k_instance_val",)
+   TEST_SEMANTIC: ("ade20k_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 160000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 512
+   MAX_SIZE_TRAIN: 2048
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (512, 512)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 512  # used in dataset mapper
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [256, 384, 512, 640, 768, 896]
+     MAX_SIZE: 3584
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
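The `MIN_SIZE_TRAIN` entry above relies on detectron2's `!!python/object/apply:eval` YAML tag, so the list of training scales is computed when the config is loaded. A quick worked expansion of that expression for the 512-pixel base size used in this file:

```python
# What the MIN_SIZE_TRAIN expression in the config above evaluates to.
base = 512
min_sizes = [int(x * 0.1 * base) for x in range(5, 21)]
print(min_sizes)
# 16 scales spanning 0.5x to 2.0x of the base size:
# [256, 307, 358, ..., 972, 1024]
```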
annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_large_bs16_160k.yaml ADDED
@@ -0,0 +1,38 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/convnext/oneformer_convnext_xlarge_bs16_160k.yaml ADDED
@@ -0,0 +1,38 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [256, 512, 1024, 2048]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/coco_pretrain_oneformer_dinat_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]]
+   WEIGHTS: "150_16_dinat_l_oneformer_coco_100ep.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 28, 1], [1, 7, 1, 14], [1, 3, 1, 5, 1, 5, 1, 7, 1, 3, 1, 5, 1, 5, 1, 7, 1, 7], [1, 3, 1, 3, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/dinat/oneformer_dinat_large_bs16_160k_896x896.yaml ADDED
@@ -0,0 +1,42 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 896
+   MAX_SIZE_TRAIN: 3584
+   MAX_SIZE_TEST: 3584
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (896, 896)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 896  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [448, 678, 896, 1120, 1344, 1568]
+     MAX_SIZE: 6272
+     FLIP: True
annotator/OneFormer/configs/ade20k/oneformer_R50_bs16_160k.yaml ADDED
@@ -0,0 +1,58 @@
+ _BASE_: Base-ADE20K-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 150
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.5
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 640
+   MAX_SIZE_TRAIN: 2560
+   MAX_SIZE_TEST: 2560
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (640, 640)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 640  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+     MAX_SIZE: 4480
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_1280x1280.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1280) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1280
+   MAX_SIZE_TRAIN: 5120
+   MAX_SIZE_TEST: 5120
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1280, 1280)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1280  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [640, 960, 1280, 1600, 1920, 2240]
+     MAX_SIZE: 8960
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_large_bs16_160k_896x896.yaml ADDED
@@ -0,0 +1,40 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 896) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 896
+   MAX_SIZE_TRAIN: 3584
+   MAX_SIZE_TEST: 3584
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (896, 896)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 896  # used in dataset mapper
+   FORMAT: "RGB"
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [448, 678, 896, 1120, 1344, 1568]
+     MAX_SIZE: 6272
+     FLIP: True
annotator/OneFormer/configs/ade20k/swin/oneformer_swin_tiny_bs16_160k.yaml ADDED
@@ -0,0 +1,15 @@
+ _BASE_: ../oneformer_R50_bs16_160k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 96
+     DEPTHS: [2, 2, 6, 2]
+     NUM_HEADS: [3, 6, 12, 24]
+     WINDOW_SIZE: 7
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+   WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
annotator/OneFormer/configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     NORM: "SyncBN"  # use syncbn for cityscapes dataset
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("cityscapes_fine_panoptic_train",)
+   TEST_PANOPTIC: ("cityscapes_fine_panoptic_val",)
+   TEST_INSTANCE: ("cityscapes_fine_instance_seg_val",)
+   TEST_SEMANTIC: ("cityscapes_fine_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 90000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 1024
+   MAX_SIZE_TRAIN: 4096
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (512, 1024)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: -1
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
+     MAX_SIZE: 4096
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
annotator/OneFormer/configs/cityscapes/convnext/mapillary_pretrain_oneformer_convnext_large_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "250_16_convnext_l_oneformer_mapillary_300k.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_large_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/convnext/oneformer_convnext_xlarge_bs16_90k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [256, 512, 1024, 2048]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_xlarge_22k_1k_384_ema.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/dinat/oneformer_dinat_large_bs16_90k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 7
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 18, 1], [1, 5, 1, 9], [1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/cityscapes/oneformer_R50_bs16_90k.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-Cityscapes-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 19
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     ENC_LAYERS: 0
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/cityscapes/swin/oneformer_swin_large_bs16_90k.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: ../oneformer_R50_bs16_90k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/coco/Base-COCO-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,54 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     # NORM: "SyncBN"
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("coco_2017_train_panoptic_with_sem_seg",)
+   TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",)  # to evaluate instance and semantic performance as well
+   TEST_INSTANCE: ("coco_2017_val",)
+   TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   STEPS: (327778, 355092)
+   MAX_ITER: 368750
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 10
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   IMAGE_SIZE: 1024
+   MIN_SCALE: 0.1
+   MAX_SCALE: 2.0
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "coco_unified_lsj"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   EVAL_PERIOD: 5000
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 4
+ VERSION: 2
annotator/OneFormer/configs/coco/dinat/oneformer_dinat_large_bs16_100ep.yaml ADDED
@@ -0,0 +1,24 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   STEPS: (655556, 710184)
+   MAX_ITER: 737500
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/oneformer_R50_bs16_50ep.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-COCO-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 255
+     NUM_CLASSES: 133
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     DETECTION_ON: False
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/swin/oneformer_swin_large_bs16_100ep.yaml ADDED
@@ -0,0 +1,28 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+
+   IS_DEMO: True
+   IS_TRAIN: False
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "./checkpoints/150_16_swin_l_oneformer_coco_100ep.pth"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 150
+ SOLVER:
+   STEPS: (655556, 735184)
+   MAX_ITER: 737500
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/coco/swin/oneformer_swin_tiny_bs16_50ep.yaml ADDED
@@ -0,0 +1,15 @@
+ _BASE_: ../oneformer_R50_bs16_50ep.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 96
+     DEPTHS: [2, 2, 6, 2]
+     NUM_HEADS: [3, 6, 12, 24]
+     WINDOW_SIZE: 7
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+   WEIGHTS: "swin_tiny_patch4_window7_224.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
annotator/OneFormer/configs/mapillary_vistas/Base-Mapillary-UnifiedSegmentation.yaml ADDED
@@ -0,0 +1,68 @@
+ MODEL:
+   BACKBONE:
+     FREEZE_AT: 0
+     NAME: "build_resnet_backbone"
+   WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   RESNETS:
+     DEPTH: 50
+     STEM_TYPE: "basic"  # not used
+     STEM_OUT_CHANNELS: 64
+     STRIDE_IN_1X1: False
+     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+     NORM: "SyncBN"  # use syncbn for cityscapes dataset
+     RES5_MULTI_GRID: [1, 1, 1]  # not used
+ DATASETS:
+   TRAIN: ("mapillary_vistas_panoptic_train",)
+   TEST_PANOPTIC: ("mapillary_vistas_panoptic_val",)
+   TEST_INSTANCE: ("mapillary_vistas_panoptic_val",)
+   TEST_SEMANTIC: ("mapillary_vistas_sem_seg_val",)
+ SOLVER:
+   IMS_PER_BATCH: 16
+   BASE_LR: 0.0001
+   MAX_ITER: 300000
+   WARMUP_FACTOR: 1.0
+   WARMUP_ITERS: 0
+   WEIGHT_DECAY: 0.05
+   OPTIMIZER: "ADAMW"
+   LR_SCHEDULER_NAME: "WarmupPolyLR"
+   BACKBONE_MULTIPLIER: 0.1
+   CLIP_GRADIENTS:
+     ENABLED: True
+     CLIP_TYPE: "full_model"
+     CLIP_VALUE: 0.01
+     NORM_TYPE: 2.0
+   AMP:
+     ENABLED: True
+ INPUT:
+   MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 2048) for x in range(5, 21)]"]
+   MIN_SIZE_TRAIN_SAMPLING: "choice"
+   MIN_SIZE_TEST: 2048
+   MAX_SIZE_TRAIN: 8192
+   MAX_SIZE_TEST: 2048
+   CROP:
+     ENABLED: True
+     TYPE: "absolute"
+     SIZE: (1024, 1024)
+     SINGLE_CATEGORY_MAX_AREA: 1.0
+   COLOR_AUG_SSD: True
+   SIZE_DIVISIBILITY: 1024  # used in dataset mapper
+   FORMAT: "RGB"
+   DATASET_MAPPER_NAME: "oneformer_unified"
+   MAX_SEQ_LEN: 77
+   TASK_SEQ_LEN: 77
+   TASK_PROB:
+     SEMANTIC: 0.50
+     INSTANCE: 0.50
+ TEST:
+   EVAL_PERIOD: 30000
+   AUG:
+     ENABLED: False
+     MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
+     MAX_SIZE: 4096
+     FLIP: True
+ DATALOADER:
+   FILTER_EMPTY_ANNOTATIONS: True
+   NUM_WORKERS: 10
+ VERSION: 2
annotator/OneFormer/configs/mapillary_vistas/convnext/cityscapes_pretrain_oneformer_convnext_large_bs16_300k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ INPUT:
+   TASK_PROB:
+     SEMANTIC: 0.33
+     INSTANCE: 0.66
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/convnext/oneformer_convnext_large_bs16_300k.yaml ADDED
@@ -0,0 +1,18 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2ConvNeXt"
+   CONVNEXT:
+     IN_CHANNELS: 3
+     DEPTHS: [3, 3, 27, 3]
+     DIMS: [192, 384, 768, 1536]
+     DROP_PATH_RATE: 0.4
+     LSIT: 1.0
+     OUT_INDICES: [0, 1, 2, 3]
+   WEIGHTS: "convnext_large_22k_1k_384.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/dinat/oneformer_dinat_large_bs16_300k.yaml ADDED
@@ -0,0 +1,22 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2DiNAT"
+   DiNAT:
+     EMBED_DIM: 192
+     MLP_RATIO: 2.0
+     DEPTHS: [3, 4, 18, 5]
+     NUM_HEADS: [6, 12, 24, 48]
+     KERNEL_SIZE: 11
+     DROP_PATH_RATE: 0.3
+     DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+   WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ SOLVER:
+   AMP:
+     ENABLED: False
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/configs/mapillary_vistas/oneformer_R50_bs16_300k.yaml ADDED
@@ -0,0 +1,59 @@
+ _BASE_: Base-Mapillary-UnifiedSegmentation.yaml
+ MODEL:
+   META_ARCHITECTURE: "OneFormer"
+   SEM_SEG_HEAD:
+     NAME: "OneFormerHead"
+     IGNORE_VALUE: 65
+     NUM_CLASSES: 65
+     LOSS_WEIGHT: 1.0
+     CONVS_DIM: 256
+     MASK_DIM: 256
+     NORM: "GN"
+     # pixel decoder
+     PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+     IN_FEATURES: ["res2", "res3", "res4", "res5"]
+     DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+     COMMON_STRIDE: 4
+     TRANSFORMER_ENC_LAYERS: 6
+   ONE_FORMER:
+     TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+     TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+     DEEP_SUPERVISION: True
+     NO_OBJECT_WEIGHT: 0.1
+     CLASS_WEIGHT: 2.0
+     MASK_WEIGHT: 5.0
+     DICE_WEIGHT: 5.0
+     CONTRASTIVE_WEIGHT: 0.5
+     CONTRASTIVE_TEMPERATURE: 0.07
+     HIDDEN_DIM: 256
+     NUM_OBJECT_QUERIES: 150
+     USE_TASK_NORM: True
+     NHEADS: 8
+     DROPOUT: 0.1
+     DIM_FEEDFORWARD: 2048
+     ENC_LAYERS: 0
+     PRE_NORM: False
+     ENFORCE_INPUT_PROJ: False
+     SIZE_DIVISIBILITY: 32
+     ENC_LAYERS: 0
+     CLASS_DEC_LAYERS: 2
+     DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+     TRAIN_NUM_POINTS: 12544
+     OVERSAMPLE_RATIO: 3.0
+     IMPORTANCE_SAMPLE_RATIO: 0.75
+   TEXT_ENCODER:
+     WIDTH: 256
+     CONTEXT_LENGTH: 77
+     NUM_LAYERS: 6
+     VOCAB_SIZE: 49408
+     PROJ_NUM_LAYERS: 2
+     N_CTX: 16
+   TEST:
+     SEMANTIC_ON: True
+     INSTANCE_ON: True
+     PANOPTIC_ON: True
+     OVERLAP_THRESHOLD: 0.8
+     OBJECT_MASK_THRESHOLD: 0.8
+     TASK: "panoptic"
+ TEST:
+   DETECTIONS_PER_IMAGE: 150
annotator/OneFormer/configs/mapillary_vistas/swin/oneformer_swin_large_bs16_300k.yaml ADDED
@@ -0,0 +1,20 @@
+ _BASE_: ../oneformer_R50_bs16_300k.yaml
+ MODEL:
+   BACKBONE:
+     NAME: "D2SwinTransformer"
+   SWIN:
+     EMBED_DIM: 192
+     DEPTHS: [2, 2, 18, 2]
+     NUM_HEADS: [6, 12, 24, 48]
+     WINDOW_SIZE: 12
+     APE: False
+     DROP_PATH_RATE: 0.3
+     PATCH_NORM: True
+     PRETRAIN_IMG_SIZE: 384
+   WEIGHTS: "swin_large_patch4_window12_384_22kto1k.pkl"
+   PIXEL_MEAN: [123.675, 116.280, 103.530]
+   PIXEL_STD: [58.395, 57.120, 57.375]
+   ONE_FORMER:
+     NUM_OBJECT_QUERIES: 250
+ TEST:
+   DETECTIONS_PER_IMAGE: 250
annotator/OneFormer/datasets/README.md ADDED
@@ -0,0 +1,168 @@
+ # Prepare Datasets for OneFormer
+
+ - A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc.).
+ - This document explains how to set up the builtin datasets so they can be used by the above APIs. [Training OneFormer with Custom Datasets](https://github.com/SHI-Labs/OneFormer/tree/main/datasets/custom_datasets) gives a deeper dive on how to train OneFormer with custom datasets.
+ - Detectron2 has builtin support for a few datasets. The datasets are assumed to exist in a directory specified by the environment variable `DETECTRON2_DATASETS`. Under this directory, detectron2 will look for datasets in the structure described below, if needed.
+
+ ```text
+ $DETECTRON2_DATASETS/
+   ADEChallengeData2016/
+   cityscapes/
+   coco/
+   mapillary_vistas/
+ ```
+
+ - You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. If left unset, the default is `./datasets` relative to your current working directory.
+
+
+ ## Expected dataset structure for [ADE20K](http://sceneparsing.csail.mit.edu/)
+
+ ```text
+ ADEChallengeData2016/
+   images/
+   annotations/
+   objectInfo150.txt
+   # download instance annotation
+   annotations_instance/
+   # generated by prepare_ade20k_sem_seg.py
+   annotations_detectron2/
+   # below are generated by prepare_ade20k_pan_seg.py
+   ade20k_panoptic_{train,val}.json
+   ade20k_panoptic_{train,val}/
+   # below are generated by prepare_ade20k_ins_seg.py
+   ade20k_instance_{train,val}.json
+ ```
+
+ - Generate `annotations_detectron2`:
+
+ ```bash
+ python datasets/prepare_ade20k_sem_seg.py
+ ```
+
+ - Install panopticapi by:
+
+ ```bash
+ pip install git+https://github.com/cocodataset/panopticapi.git
+ ```
+
+ - Download the instance annotation from <http://sceneparsing.csail.mit.edu/>:
+
+ ```bash
+ wget http://sceneparsing.csail.mit.edu/data/ChallengeData2017/annotations_instance.tar
+ ```
+
+ - Then, run `python datasets/prepare_ade20k_pan_seg.py` to combine semantic and instance annotations into panoptic annotations.
+
+ - Run `python datasets/prepare_ade20k_ins_seg.py` to extract instance annotations in COCO format.
+
+ ## Expected dataset structure for [Cityscapes](https://www.cityscapes-dataset.com/downloads/)
+
+ ```text
+ cityscapes/
+   gtFine/
+     train/
+       aachen/
+         color.png, instanceIds.png, labelIds.png, polygons.json,
+         labelTrainIds.png
+       ...
+     val/
+     test/
+     # below are generated Cityscapes panoptic annotation
+     cityscapes_panoptic_train.json
+     cityscapes_panoptic_train/
+     cityscapes_panoptic_val.json
+     cityscapes_panoptic_val/
+     cityscapes_panoptic_test.json
+     cityscapes_panoptic_test/
+   leftImg8bit/
+     train/
+     val/
+     test/
+ ```
+
+ - Log in and download the dataset:
+
+ ```bash
+ wget --keep-session-cookies --save-cookies=cookies.txt --post-data 'username=myusername&password=mypassword&submit=Login' https://www.cityscapes-dataset.com/login/
+ ######## gtFine
+ wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=1
+ ######## leftImg8bit
+ wget --load-cookies cookies.txt --content-disposition https://www.cityscapes-dataset.com/file-handling/?packageID=3
+ ```
+
+ - Install cityscapes scripts by:
+
+ ```bash
+ pip install git+https://github.com/mcordts/cityscapesScripts.git
+ ```
+
+ - To create labelTrainIds.png, first prepare the above structure, then run the cityscapesScripts preparation tool:
+
+ ```bash
+ git clone https://github.com/mcordts/cityscapesScripts.git
+ ```
+
+ ```bash
+ CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createTrainIdLabelImgs.py
+ ```
+
+ These files are not needed for instance segmentation.
+
+ - To generate the Cityscapes panoptic dataset, run the cityscapesScripts preparation tool:
+
+ ```bash
+ CITYSCAPES_DATASET=/path/to/abovementioned/cityscapes python cityscapesScripts/cityscapesscripts/preparation/createPanopticImgs.py
+ ```
+
+ These files are not needed for semantic and instance segmentation.
+
+ ## Expected dataset structure for [COCO](https://cocodataset.org/#download)
+
+ ```text
+ coco/
+   annotations/
+     instances_{train,val}2017.json
+     panoptic_{train,val}2017.json
+     caption_{train,val}2017.json
+     # evaluate on instance labels derived from panoptic annotations
+     panoptic2instances_val2017.json
+   {train,val}2017/
+     # image files that are mentioned in the corresponding json
+   panoptic_{train,val}2017/  # png annotations
+   panoptic_semseg_{train,val}2017/  # generated by the script mentioned below
+ ```
+
+ - Install panopticapi by:
+
+ ```bash
+ pip install git+https://github.com/cocodataset/panopticapi.git
+ ```
+
+ - Then, run `python datasets/prepare_coco_semantic_annos_from_panoptic_annos.py` to extract semantic annotations from panoptic annotations (only used for evaluation).
+
+ - Then run the following command to convert the panoptic json into instance json format (used for evaluation on the instance segmentation task):
+
+ ```bash
+ python datasets/panoptic2detection_coco_format.py --things_only
+ ```
+
+ ## Expected dataset structure for [Mapillary Vistas](https://www.mapillary.com/dataset/vistas)
+
+ ```text
+ mapillary_vistas/
+   training/
+     images/
+     instances/
+     labels/
+     panoptic/
+   validation/
+     images/
+     instances/
+     labels/
+     panoptic/
+   mapillary_vistas_instance_{train,val}.json  # generated by the script mentioned below
+ ```
+
+ No preprocessing is needed for Mapillary Vistas on semantic and panoptic segmentation.
+
+ We do not evaluate for the instance segmentation task on the Mapillary Vistas dataset.
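As a quick check that the directory layout above is picked up, one can query detectron2's catalogs after pointing `DETECTRON2_DATASETS` at the data root. This is a hedged sketch: it assumes that importing the `oneformer` package (as the demo's `annotator/OneFormer/__init__.py` does) is what registers builtin names such as `ade20k_panoptic_train`.

```python
# Sketch: verify a builtin dataset is visible to detectron2 after setup.
import os

os.environ["DETECTRON2_DATASETS"] = "/path/to/datasets"  # the data root described above

from detectron2.data import DatasetCatalog, MetadataCatalog
import oneformer  # noqa: F401  # assumed to register the builtin datasets on import

meta = MetadataCatalog.get("ade20k_panoptic_train")      # class names, colors, ...
records = DatasetCatalog.get("ade20k_panoptic_train")    # list of per-image dicts
print(len(records), "training images")
```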
annotator/OneFormer/datasets/ade20k_instance_catid_mapping.txt ADDED
@@ -0,0 +1,104 @@
+ Instacne100 SceneParse150 FullADE20K
+ 1 8 165
+ 2 9 3055
+ 3 11 350
+ 4 13 1831
+ 5 15 774
+ 5 15 783
+ 6 16 2684
+ 7 19 687
+ 8 20 471
+ 9 21 401
+ 10 23 1735
+ 11 24 2473
+ 12 25 2329
+ 13 28 1564
+ 14 31 57
+ 15 32 2272
+ 16 33 907
+ 17 34 724
+ 18 36 2985
+ 18 36 533
+ 19 37 1395
+ 20 38 155
+ 21 39 2053
+ 22 40 689
+ 23 42 266
+ 24 43 581
+ 25 44 2380
+ 26 45 491
+ 27 46 627
+ 28 48 2388
+ 29 50 943
+ 30 51 2096
+ 31 54 2530
+ 32 56 420
+ 33 57 1948
+ 34 58 1869
+ 35 59 2251
+ 36 63 239
+ 37 65 571
+ 38 66 2793
+ 39 67 978
+ 40 68 236
+ 41 70 181
+ 42 71 629
+ 43 72 2598
+ 44 73 1744
+ 45 74 1374
+ 46 75 591
+ 47 76 2679
+ 48 77 223
+ 49 79 47
+ 50 81 327
+ 51 82 2821
+ 52 83 1451
+ 53 84 2880
+ 54 86 480
+ 55 87 77
+ 56 88 2616
+ 57 89 246
+ 57 89 247
+ 58 90 2733
+ 59 91 14
+ 60 93 38
+ 61 94 1936
+ 62 96 120
+ 63 98 1702
+ 64 99 249
+ 65 103 2928
+ 66 104 2337
+ 67 105 1023
+ 68 108 2989
+ 69 109 1930
+ 70 111 2586
+ 71 112 131
+ 72 113 146
+ 73 116 95
+ 74 117 1563
+ 75 119 1708
+ 76 120 103
+ 77 121 1002
+ 78 122 2569
+ 79 124 2833
+ 80 125 1551
+ 81 126 1981
+ 82 127 29
+ 83 128 187
+ 84 130 747
+ 85 131 2254
+ 86 133 2262
+ 87 134 1260
+ 88 135 2243
+ 89 136 2932
+ 90 137 2836
+ 91 138 2850
+ 92 139 64
+ 93 140 894
+ 94 143 1919
+ 95 144 1583
+ 96 145 318
+ 97 147 2046
+ 98 148 1098
+ 99 149 530
+ 100 150 954
annotator/OneFormer/datasets/custom_datasets/README.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training OneFormer with Custom Datasets
2
+
3
+ OneFormer advocates the usage of panoptic annotations along with its task-conditioned joint training strategy. However, if panoptic annotations are not available, then also OneFormer can be trained using only the instance or semantic annotations on custom datasets. We provide some guidelines for training with custom datasets.
4
+
5
+ ## Register your New Dataset
6
+
7
+ - OneFormer uses the information (class names, thing classes, etc.) stored in a dataset's metadata while preparing a dataset dictionary using a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers).
8
+
9
+ - [Use Custom Datasets](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html) gives a deeper dive into registering a new custom dataset; a minimal registration sketch is shown below.
10
+
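+ As a minimal sketch (the dataset name, class names, id mapping, and loader function below are placeholders, not part of OneFormer), registering a custom instance-segmentation dataset with detectron2 typically looks like this:
+
+ ```python
+ # Hypothetical registration sketch for a custom instance-segmentation dataset.
+ from detectron2.data import DatasetCatalog, MetadataCatalog
+
+ def load_my_dataset_train():
+     # Return a list[dict] in the standard Detectron2 Dataset format
+     # (each dict holds "file_name", "height", "width", "annotations", ...).
+     return []
+
+ DatasetCatalog.register("my_dataset_train", load_my_dataset_train)
+ MetadataCatalog.get("my_dataset_train").set(
+     thing_classes=["cat", "dog"],                    # placeholder class names
+     thing_dataset_id_to_contiguous_id={1: 0, 2: 1},  # read by the custom mappers
+     ignore_label=255,
+ )
+ ```
+
+ The `thing_classes` and `thing_dataset_id_to_contiguous_id` fields are exactly the metadata the instance mappers in this folder read; the semantic mapper instead reads `stuff_classes` and `ignore_label` from the same metadata.
+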
11
+ ## Training with Available Panoptic Annotations
12
+
13
+ - To prepare the dataset dictionary for each iteration during training, OneFormer uses a [`dataset_mapper`](https://github.com/SHI-Labs/OneFormer/tree/main/oneformer/data/dataset_mappers) class.
14
+
15
+ - Out of the box, we provide two `dataset_mapper` classes that support task-conditioned joint training using panoptic annotations:
16
+ - [`COCOUnifiedNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py#L56): Specifically designed for COCO annotation format.
17
+ - [`OneFormerUnifiedDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py#L26): General annotation format.
18
+
19
+ - If you have panoptic annotations for your custom dataset, you may use these dataset_mapper classes directly after registering your dataset. You may also tune the [task sampling probabilities in the corresponding config file](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml#L55).
20
+
21
+ - If you want to train using only instance or semantic annotations, please follow the next section on preparing a custom dataset mapper class.
22
+
23
+ ## Write a Custom Dataset Mapper Class
24
+
25
+ - If you want to train using only instance or semantic annotations, write your custom dataset mapper class and add it to the [`build_train_loader`](https://github.com/SHI-Labs/OneFormer/blob/5e04c9aaffd9bc73020d2238757f62346fe778c0/train_net.py#L156) method (a sketch of such an override is shown after this list).
26
+
27
+ - We provide a few templates for custom dataset mappers:
28
+ - [`InstanceCOCOCustomNewBaselineDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py#L72): Specifically designed for COCO instance annotation format.
29
+ - [`InstanceOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py#L26): General instance annotation format.
30
+ - [`SemanticOneFormerCustomDatasetMapper`](https://github.com/SHI-Labs/OneFormer/blob/a7fae86ce5791a93132c059c1bdfc79c9f842820/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py#L26): General semantic annotation format.
31
+
32
+ - Remember to register your custom dataset before training.
33
+
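+ As a rough sketch (the trainer class and mapper choice are illustrative, not the repository's exact `train_net.py`, and the import path assumes you run from the repository root), an override using the COCO-format custom instance mapper from this folder could look like this:
+
+ ```python
+ # Illustrative Trainer.build_train_loader override with a custom mapper.
+ from detectron2.data import build_detection_train_loader
+ from detectron2.engine import DefaultTrainer
+
+ from datasets.custom_datasets.instance_coco_custom_dataset_mapper import (
+     InstanceCOCOCustomNewBaselineDatasetMapper,
+ )
+
+ class Trainer(DefaultTrainer):
+     @classmethod
+     def build_train_loader(cls, cfg):
+         # Swap in the custom COCO-format instance mapper in place of the
+         # panoptic-based unified mappers.
+         mapper = InstanceCOCOCustomNewBaselineDatasetMapper(cfg, is_train=True)
+         return build_detection_train_loader(cfg, mapper=mapper)
+ ```
+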
34
+
35
+ Now you are all set to train OneFormer using your custom dataset!
annotator/OneFormer/datasets/custom_datasets/instance_coco_custom_dataset_mapper.py ADDED
@@ -0,0 +1,235 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_instance_new_baseline_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ from detectron2.data import MetadataCatalog
13
+ from detectron2.config import configurable
14
+ from detectron2.data import detection_utils as utils
15
+ from detectron2.data import transforms as T
16
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
17
+ from pycocotools import mask as coco_mask
18
+
19
+ __all__ = ["InstanceCOCOCustomNewBaselineDatasetMapper"]
20
+
21
+
22
+ def convert_coco_poly_to_mask(segmentations, height, width):
23
+ masks = []
24
+ for polygons in segmentations:
25
+ rles = coco_mask.frPyObjects(polygons, height, width)
26
+ mask = coco_mask.decode(rles)
27
+ if len(mask.shape) < 3:
28
+ mask = mask[..., None]
29
+ mask = torch.as_tensor(mask, dtype=torch.uint8)
30
+ mask = mask.any(dim=2)
31
+ masks.append(mask)
32
+ if masks:
33
+ masks = torch.stack(masks, dim=0)
34
+ else:
35
+ masks = torch.zeros((0, height, width), dtype=torch.uint8)
36
+ return masks
37
+
38
+
39
+ def build_transform_gen(cfg, is_train):
40
+ """
41
+ Create a list of default :class:`Augmentation` from config.
42
+ Now it includes resizing and flipping.
43
+ Returns:
44
+ list[Augmentation]
45
+ """
46
+ assert is_train, "Only support training augmentation"
47
+ image_size = cfg.INPUT.IMAGE_SIZE
48
+ min_scale = cfg.INPUT.MIN_SCALE
49
+ max_scale = cfg.INPUT.MAX_SCALE
50
+
51
+ augmentation = []
52
+
53
+ if cfg.INPUT.RANDOM_FLIP != "none":
54
+ augmentation.append(
55
+ T.RandomFlip(
56
+ horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
57
+ vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
58
+ )
59
+ )
60
+
61
+ augmentation.extend([
62
+ T.ResizeScale(
63
+ min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
64
+ ),
65
+ T.FixedSizeCrop(crop_size=(image_size, image_size)),
66
+ ])
67
+
68
+ return augmentation
69
+
70
+
71
+ # This is specifically designed for the COCO Instance Segmentation dataset.
72
+ class InstanceCOCOCustomNewBaselineDatasetMapper:
73
+ """
74
+ A callable which takes a dataset dict in Detectron2 Dataset format,
75
+ and maps it into a format used by OneFormer for custom instance segmentation using the COCO format.
76
+
77
+ The callable currently does the following:
78
+
79
+ 1. Reads the image from "file_name"
80
+ 2. Applies geometric transforms to the image and annotation
81
+ 3. Finds and applies suitable cropping to the image and annotation
82
+ 4. Prepares the image and annotation as Tensors
83
+ """
84
+
85
+ @configurable
86
+ def __init__(
87
+ self,
88
+ is_train=True,
89
+ *,
90
+ num_queries,
91
+ tfm_gens,
92
+ meta,
93
+ image_format,
94
+ max_seq_len,
95
+ task_seq_len,
96
+ ):
97
+ """
98
+ NOTE: this interface is experimental.
99
+ Args:
100
+ is_train: for training or inference
101
+ num_queries: number of text queries (NUM_OBJECT_QUERIES - N_CTX)
+ meta: dataset metadata
+ max_seq_len, task_seq_len: sequence lengths for the text and task tokenizers
103
+ tfm_gens: data augmentation
104
+ image_format: an image format supported by :func:`detection_utils.read_image`.
105
+ """
106
+ self.tfm_gens = tfm_gens
107
+ logging.getLogger(__name__).info(
108
+ "[InstanceCOCOCustomNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
109
+ str(self.tfm_gens)
110
+ )
111
+ )
112
+
113
+ self.img_format = image_format
114
+ self.is_train = is_train
115
+ self.meta = meta
116
+ self.num_queries = num_queries
117
+
118
+ self.things = []
119
+ for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
120
+ self.things.append(v)
121
+ self.class_names = self.meta.thing_classes
122
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
123
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
124
+
125
+ @classmethod
126
+ def from_config(cls, cfg, is_train=True):
127
+ # Build augmentation
128
+ tfm_gens = build_transform_gen(cfg, is_train)
129
+ dataset_names = cfg.DATASETS.TRAIN
130
+ meta = MetadataCatalog.get(dataset_names[0])
131
+
132
+ ret = {
133
+ "is_train": is_train,
134
+ "meta": meta,
135
+ "tfm_gens": tfm_gens,
136
+ "image_format": cfg.INPUT.FORMAT,
137
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
138
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
139
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
140
+ }
141
+ return ret
142
+
143
+ def _get_texts(self, classes, num_class_obj):
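+ # Builds the list of text queries for this image: one "a photo with a <class>"
+ # entry per annotated object (capped at self.num_queries), with the remaining
+ # slots left as the generic "an instance photo" prompt.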
144
+
145
+ classes = list(np.array(classes))
146
+ texts = ["an instance photo"] * self.num_queries
147
+
148
+ for class_id in classes:
149
+ cls_name = self.class_names[class_id]
150
+ num_class_obj[cls_name] += 1
151
+
152
+ num = 0
153
+ for i, cls_name in enumerate(self.class_names):
154
+ if num_class_obj[cls_name] > 0:
155
+ for _ in range(num_class_obj[cls_name]):
156
+ if num >= len(texts):
157
+ break
158
+ texts[num] = f"a photo with a {cls_name}"
159
+ num += 1
160
+
161
+ return texts
162
+
163
+ def __call__(self, dataset_dict):
164
+ """
165
+ Args:
166
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
167
+
168
+ Returns:
169
+ dict: a format that builtin models in detectron2 accept
170
+ """
171
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
172
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
173
+ utils.check_image_size(dataset_dict, image)
174
+
175
+ # TODO: get padding mask
176
+ # by feeding a "segmentation mask" to the same transforms
177
+ padding_mask = np.ones(image.shape[:2])
178
+
179
+ image, transforms = T.apply_transform_gens(self.tfm_gens, image)
180
+ # the crop transformation has default padding value 0 for segmentation
181
+ padding_mask = transforms.apply_segmentation(padding_mask)
182
+ padding_mask = ~ padding_mask.astype(bool)
183
+
184
+ image_shape = image.shape[:2] # h, w
185
+
186
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
187
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
188
+ # Therefore it's important to use torch.Tensor.
189
+ dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
190
+ dataset_dict["padding_mask"] = torch.as_tensor(np.ascontiguousarray(padding_mask))
191
+
192
+ if not self.is_train:
193
+ # USER: Modify this if you want to keep them for some reason.
194
+ dataset_dict.pop("annotations", None)
195
+ return dataset_dict
196
+
197
+ if "annotations" in dataset_dict:
198
+ # USER: Modify this if you want to keep them for some reason.
199
+ for anno in dataset_dict["annotations"]:
200
+ anno.pop("keypoints", None)
201
+
202
+ # USER: Implement additional transformations if you have other types of data
203
+ annos = [
204
+ utils.transform_instance_annotations(obj, transforms, image_shape)
205
+ for obj in dataset_dict.pop("annotations")
206
+ if obj.get("iscrowd", 0) == 0
207
+ ]
208
+
209
+ instances = utils.annotations_to_instances(annos, image_shape)
210
+
211
+ instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
212
+ # Need to filter empty instances first (due to augmentation)
213
+ instances = utils.filter_empty_instances(instances)
214
+ # Generate masks from polygon
215
+ h, w = instances.image_size
216
+ # image_size_xyxy = torch.as_tensor([w, h, w, h], dtype=torch.float)
217
+ if hasattr(instances, 'gt_masks'):
218
+ gt_masks = instances.gt_masks
219
+ gt_masks = convert_coco_poly_to_mask(gt_masks.polygons, h, w)
220
+ instances.gt_masks = gt_masks
221
+
222
+ num_class_obj = {}
223
+ for name in self.class_names:
224
+ num_class_obj[name] = 0
225
+
226
+ task = "The task is instance"
227
+ text = self._get_texts(instances.gt_classes, num_class_obj)
228
+
229
+ dataset_dict["instances"] = instances
230
+ dataset_dict["orig_shape"] = image_shape
231
+ dataset_dict["task"] = task
232
+ dataset_dict["text"] = text
233
+ dataset_dict["thing_ids"] = self.things
234
+
235
+ return dataset_dict
annotator/OneFormer/datasets/custom_datasets/instance_oneformer_custom_dataset_mapper.py ADDED
@@ -0,0 +1,245 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_instance_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+ import os
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.nn import functional as F
13
+
14
+ from detectron2.config import configurable
15
+ from detectron2.data import detection_utils as utils
16
+ from detectron2.data import transforms as T
17
+ from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
18
+ from detectron2.data import MetadataCatalog
19
+ from detectron2.projects.point_rend import ColorAugSSDTransform
20
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
21
+ import pycocotools.mask as mask_util
22
+
23
+ __all__ = ["InstanceOneFormerCustomDatasetMapper"]
24
+
25
+
26
+ class InstanceOneFormerCustomDatasetMapper:
27
+ """
28
+ A callable which takes a dataset dict in Detectron2 Dataset format,
29
+ and maps it into a format used by OneFormer for custom instance segmentation.
30
+
31
+ The callable currently does the following:
32
+
33
+ 1. Reads the image from "file_name"
34
+ 2. Applies geometric transforms to the image and annotation
35
+ 3. Finds and applies suitable cropping to the image and annotation
36
+ 4. Prepares the image and annotation as Tensors
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ is_train=True,
43
+ *,
44
+ name,
45
+ num_queries,
46
+ meta,
47
+ augmentations,
48
+ image_format,
49
+ size_divisibility,
50
+ task_seq_len,
51
+ max_seq_len,
52
+ ):
53
+ """
54
+ NOTE: this interface is experimental.
55
+ Args:
56
+ is_train: for training or inference
57
+ augmentations: a list of augmentations or deterministic transforms to apply
58
+ image_format: an image format supported by :func:`detection_utils.read_image`.
59
+ ignore_label: the label that is ignored during evaluation
60
+ size_divisibility: pad image size to be divisible by this value
61
+ """
62
+ self.is_train = is_train
63
+ self.meta = meta
64
+ self.name = name
65
+ self.tfm_gens = augmentations
66
+ self.img_format = image_format
67
+ self.size_divisibility = size_divisibility
68
+ self.num_queries = num_queries
69
+
70
+ logger = logging.getLogger(__name__)
71
+ mode = "training" if is_train else "inference"
72
+ logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
73
+
74
+ self.things = []
75
+ for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
76
+ self.things.append(v)
77
+ self.class_names = self.meta.thing_classes
78
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
79
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
80
+
81
+ @classmethod
82
+ def from_config(cls, cfg, is_train=True):
83
+ # Build augmentation
84
+ augs = [
85
+ T.ResizeShortestEdge(
86
+ cfg.INPUT.MIN_SIZE_TRAIN,
87
+ cfg.INPUT.MAX_SIZE_TRAIN,
88
+ cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
89
+ )
90
+ ]
91
+ if cfg.INPUT.CROP.ENABLED:
92
+ augs.append(
93
+ T.RandomCrop(
94
+ cfg.INPUT.CROP.TYPE,
95
+ cfg.INPUT.CROP.SIZE,
96
+ )
97
+ )
98
+ if cfg.INPUT.COLOR_AUG_SSD:
99
+ augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
100
+ augs.append(T.RandomFlip())
101
+
102
+ # Assume always applies to the training set.
103
+ dataset_names = cfg.DATASETS.TRAIN
104
+ meta = MetadataCatalog.get(dataset_names[0])
105
+
106
+ ret = {
107
+ "is_train": is_train,
108
+ "meta": meta,
109
+ "name": dataset_names[0],
110
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
111
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
112
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
113
+ "augmentations": augs,
114
+ "image_format": cfg.INPUT.FORMAT,
115
+ "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
116
+ # NOTE: task sampling probabilities are not needed by this instance-only mapper
+ # and are not accepted by __init__, so they are intentionally not returned here.
118
+ }
119
+ return ret
120
+
121
+ def _get_texts(self, classes, num_class_obj):
122
+
123
+ classes = list(np.array(classes))
124
+ texts = ["an instance photo"] * self.num_queries
125
+
126
+ for class_id in classes:
127
+ cls_name = self.class_names[class_id]
128
+ num_class_obj[cls_name] += 1
129
+
130
+ num = 0
131
+ for i, cls_name in enumerate(self.class_names):
132
+ if num_class_obj[cls_name] > 0:
133
+ for _ in range(num_class_obj[cls_name]):
134
+ if num >= len(texts):
135
+ break
136
+ texts[num] = f"a photo with a {cls_name}"
137
+ num += 1
138
+
139
+ return texts
140
+
141
+ def __call__(self, dataset_dict):
142
+ """
143
+ Args:
144
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
145
+
146
+ Returns:
147
+ dict: a format that builtin models in detectron2 accept
148
+ """
149
+ assert self.is_train, "OneFormerDatasetMapper should only be used for training!"
150
+
151
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
152
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
153
+ utils.check_image_size(dataset_dict, image)
154
+
155
+ aug_input = T.AugInput(image)
156
+ aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
157
+ image = aug_input.image
158
+
159
+ # transform instance masks
160
+ assert "annotations" in dataset_dict
161
+ for anno in dataset_dict["annotations"]:
162
+ anno.pop("keypoints", None)
163
+
164
+ annos = [
165
+ utils.transform_instance_annotations(obj, transforms, image.shape[:2])
166
+ for obj in dataset_dict.pop("annotations")
167
+ if obj.get("iscrowd", 0) == 0
168
+ ]
169
+
170
+ if len(annos):
171
+ assert "segmentation" in annos[0]
172
+ segms = [obj["segmentation"] for obj in annos]
173
+ masks = []
174
+ for segm in segms:
175
+ if isinstance(segm, list):
176
+ # polygon
177
+ masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
178
+ elif isinstance(segm, dict):
179
+ # COCO RLE
180
+ masks.append(mask_util.decode(segm))
181
+ elif isinstance(segm, np.ndarray):
182
+ assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
183
+ segm.ndim
184
+ )
185
+ # mask array
186
+ masks.append(segm)
187
+ else:
188
+ raise ValueError(
189
+ "Cannot convert segmentation of type '{}' to BitMasks!"
190
+ "Supported types are: polygons as list[list[float] or ndarray],"
191
+ " COCO-style RLE as a dict, or a binary segmentation mask "
192
+ " in a 2D numpy array of shape HxW.".format(type(segm))
193
+ )
194
+
195
+ # Pad image and segmentation label here!
196
+ image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
197
+ masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]
198
+
199
+ classes = [int(obj["category_id"]) for obj in annos]
200
+ classes = torch.tensor(classes, dtype=torch.int64)
201
+
202
+ if self.size_divisibility > 0:
203
+ image_size = (image.shape[-2], image.shape[-1])
204
+ padding_size = [
205
+ 0,
206
+ self.size_divisibility - image_size[1],
207
+ 0,
208
+ self.size_divisibility - image_size[0],
209
+ ]
210
+ # pad image
211
+ image = F.pad(image, padding_size, value=128).contiguous()
212
+ # pad mask
213
+ masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]
214
+
215
+ image_shape = (image.shape[-2], image.shape[-1]) # h, w
216
+
217
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
218
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
219
+ # Therefore it's important to use torch.Tensor.
220
+ dataset_dict["image"] = image
221
+
222
+ # Prepare per-category binary masks
223
+ instances = Instances(image_shape)
224
+ instances.gt_classes = classes
225
+ if len(masks) == 0:
226
+ # Some image does not have annotation (all ignored)
227
+ instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
228
+ else:
229
+ masks = BitMasks(torch.stack(masks))
230
+ instances.gt_masks = masks.tensor
231
+
232
+ num_class_obj = {}
233
+ for name in self.class_names:
234
+ num_class_obj[name] = 0
235
+
236
+ task = "The task is instance"
237
+ text = self._get_texts(instances.gt_classes, num_class_obj)
238
+
239
+ dataset_dict["instances"] = instances
240
+ dataset_dict["orig_shape"] = image_shape
241
+ dataset_dict["task"] = task
242
+ dataset_dict["text"] = text
243
+ dataset_dict["thing_ids"] = self.things
244
+
245
+ return dataset_dict
annotator/OneFormer/datasets/custom_datasets/semantic_oneformer_custom_dataset_mapper.py ADDED
@@ -0,0 +1,238 @@
1
+ # ------------------------------------------------------------------------------
2
+ # Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_semantic_dataset_mapper.py
3
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import copy
7
+ import logging
8
+ import os
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.nn import functional as F
13
+
14
+ from detectron2.config import configurable
15
+ from detectron2.data import detection_utils as utils
16
+ from detectron2.data import transforms as T
17
+ from detectron2.structures import BitMasks, Instances, polygons_to_bitmask
18
+ from detectron2.data import MetadataCatalog
19
+ from detectron2.projects.point_rend import ColorAugSSDTransform
20
+ from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
21
+ import pycocotools.mask as mask_util
22
+
23
+ __all__ = ["SemanticOneFormerCustomDatasetMapper"]
24
+
25
+
26
+ class SemanticOneFormerCustomDatasetMapper:
27
+ """
28
+ A callable which takes a dataset dict in Detectron2 Dataset format,
29
+ and maps it into a format used by OneFormer for custom semantic segmentation.
30
+
31
+ The callable currently does the following:
32
+
33
+ 1. Reads the image from "file_name"
34
+ 2. Applies geometric transforms to the image and annotation
35
+ 3. Finds and applies suitable cropping to the image and annotation
36
+ 4. Prepares the image and annotation as Tensors
37
+ """
38
+
39
+ @configurable
40
+ def __init__(
41
+ self,
42
+ is_train=True,
43
+ *,
44
+ name,
45
+ num_queries,
46
+ meta,
47
+ augmentations,
48
+ image_format,
49
+ ignore_label,
50
+ size_divisibility,
51
+ task_seq_len,
52
+ max_seq_len,
53
+ ):
54
+ """
55
+ NOTE: this interface is experimental.
56
+ Args:
57
+ is_train: for training or inference
58
+ augmentations: a list of augmentations or deterministic transforms to apply
59
+ image_format: an image format supported by :func:`detection_utils.read_image`.
60
+ ignore_label: the label that is ignored during evaluation
61
+ size_divisibility: pad image size to be divisible by this value
62
+ """
63
+ self.is_train = is_train
64
+ self.meta = meta
65
+ self.name = name
66
+ self.tfm_gens = augmentations
67
+ self.img_format = image_format
68
+ self.ignore_label = ignore_label
69
+ self.size_divisibility = size_divisibility
70
+ self.num_queries = num_queries
71
+
72
+ logger = logging.getLogger(__name__)
73
+ mode = "training" if is_train else "inference"
74
+ logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
75
+
76
+ self.class_names = self.meta.stuff_classes
77
+ self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
78
+ self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
79
+
80
+ @classmethod
81
+ def from_config(cls, cfg, is_train=True):
82
+ # Build augmentation
83
+ augs = [
84
+ T.ResizeShortestEdge(
85
+ cfg.INPUT.MIN_SIZE_TRAIN,
86
+ cfg.INPUT.MAX_SIZE_TRAIN,
87
+ cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
88
+ )
89
+ ]
90
+ if cfg.INPUT.CROP.ENABLED:
91
+ augs.append(
92
+ T.RandomCrop_CategoryAreaConstraint(
93
+ cfg.INPUT.CROP.TYPE,
94
+ cfg.INPUT.CROP.SIZE,
95
+ cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
96
+ cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
97
+ )
98
+ )
99
+ if cfg.INPUT.COLOR_AUG_SSD:
100
+ augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
101
+ augs.append(T.RandomFlip())
102
+
103
+ # Assume always applies to the training set.
104
+ dataset_names = cfg.DATASETS.TRAIN
105
+ meta = MetadataCatalog.get(dataset_names[0])
106
+ ignore_label = meta.ignore_label
107
+
108
+ ret = {
109
+ "is_train": is_train,
110
+ "meta": meta,
111
+ "name": dataset_names[0],
112
+ "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
113
+ "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
114
+ "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
115
+ "augmentations": augs,
116
+ "image_format": cfg.INPUT.FORMAT,
117
+ "ignore_label": ignore_label,
118
+ "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
119
+ }
120
+ return ret
121
+
122
+ def _get_texts(self, classes, num_class_obj):
123
+
124
+ classes = list(np.array(classes))
125
+ texts = ["an semantic photo"] * self.num_queries
126
+
127
+ for class_id in classes:
128
+ cls_name = self.class_names[class_id]
129
+ num_class_obj[cls_name] += 1
130
+
131
+ num = 0
132
+ for i, cls_name in enumerate(self.class_names):
133
+ if num_class_obj[cls_name] > 0:
134
+ for _ in range(num_class_obj[cls_name]):
135
+ if num >= len(texts):
136
+ break
137
+ texts[num] = f"a photo with a {cls_name}"
138
+ num += 1
139
+
140
+ return texts
141
+
142
+ def __call__(self, dataset_dict):
143
+ """
144
+ Args:
145
+ dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
146
+
147
+ Returns:
148
+ dict: a format that builtin models in detectron2 accept
149
+ """
150
+ assert self.is_train, "SemanticOneFormerCustomDatasetMapper should only be used for training!"
151
+
152
+ dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below
153
+ image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
154
+ utils.check_image_size(dataset_dict, image)
155
+
156
+ if "sem_seg_file_name" in dataset_dict:
157
+ # PyTorch transformation not implemented for uint16, so converting it to double first
158
+ sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
159
+ else:
160
+ sem_seg_gt = None
161
+
162
+ if sem_seg_gt is None:
163
+ raise ValueError(
164
+ "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format(
165
+ dataset_dict["file_name"]
166
+ )
167
+ )
168
+
169
+ aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
170
+ aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
171
+ image = aug_input.image
172
+ sem_seg_gt = aug_input.sem_seg
173
+
174
+ # Pad image and segmentation label here!
175
+ image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
176
+ if sem_seg_gt is not None:
177
+ sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
178
+
179
+ if self.size_divisibility > 0:
180
+ image_size = (image.shape[-2], image.shape[-1])
181
+ padding_size = [
182
+ 0,
183
+ self.size_divisibility - image_size[1],
184
+ 0,
185
+ self.size_divisibility - image_size[0],
186
+ ]
187
+ image = F.pad(image, padding_size, value=128).contiguous()
188
+ if sem_seg_gt is not None:
189
+ sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
190
+
191
+ image_shape = (image.shape[-2], image.shape[-1]) # h, w
192
+
193
+ # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
194
+ # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
195
+ # Therefore it's important to use torch.Tensor.
196
+ dataset_dict["image"] = image
197
+
198
+ if sem_seg_gt is not None:
199
+ dataset_dict["sem_seg"] = sem_seg_gt.long()
200
+
201
+ if "annotations" in dataset_dict:
202
+ raise ValueError("Semantic segmentation dataset should not have 'annotations'.")
203
+
204
+ # Prepare per-category binary masks
205
+ if sem_seg_gt is not None:
206
+ sem_seg_gt = sem_seg_gt.numpy()
207
+ instances = Instances(image_shape)
208
+ classes = np.unique(sem_seg_gt)
209
+ # remove ignored region
210
+ classes = classes[classes != self.ignore_label]
211
+ instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
212
+
213
+ masks = []
214
+ for class_id in classes:
215
+ masks.append(sem_seg_gt == class_id)
216
+
217
+ if len(masks) == 0:
218
+ # Some image does not have annotation (all ignored)
219
+ instances.gt_masks = torch.zeros((0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]))
220
+ else:
221
+ masks = BitMasks(
222
+ torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
223
+ )
224
+ instances.gt_masks = masks.tensor
225
+
226
+ num_class_obj = {}
227
+ for name in self.class_names:
228
+ num_class_obj[name] = 0
229
+
230
+ task = "The task is semantic"
231
+ text = self._get_texts(instances.gt_classes, num_class_obj)
232
+
233
+ dataset_dict["instances"] = instances
234
+ dataset_dict["orig_shape"] = image_shape
235
+ dataset_dict["task"] = task
236
+ dataset_dict["text"] = text
237
+
238
+ return dataset_dict
annotator/OneFormer/datasets/fg_ids.py ADDED
@@ -0,0 +1,108 @@
1
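+ # Maps the 100 ADE20K "Instance100" thing category ids to their SceneParse150
+ # semantic ids (the same pairs as in ade20k_instance_catid_mapping.txt).
+ # Note: the repeated keys 5, 18 and 57 each collapse to a single dict entry.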
+ ADE20K_FG_IDS = {
2
+ 1: 8,
3
+ 2: 9,
4
+ 3: 11,
5
+ 4: 13,
6
+ 5: 15,
7
+ 5: 15,
8
+ 6: 16,
9
+ 7: 19,
10
+ 8: 20,
11
+ 9: 21,
12
+ 10: 23,
13
+ 11: 24,
14
+ 12: 25,
15
+ 13: 28,
16
+ 14: 31,
17
+ 15: 32,
18
+ 16: 33,
19
+ 17: 34,
20
+ 18: 36,
21
+ 18: 36,
22
+ 19: 37,
23
+ 20: 38,
24
+ 21: 39,
25
+ 22: 40,
26
+ 23: 42,
27
+ 24: 43,
28
+ 25: 44,
29
+ 26: 45,
30
+ 27: 46,
31
+ 28: 48,
32
+ 29: 50,
33
+ 30: 51,
34
+ 31: 54,
35
+ 32: 56,
36
+ 33: 57,
37
+ 34: 58,
38
+ 35: 59,
39
+ 36: 63,
40
+ 37: 65,
41
+ 38: 66,
42
+ 39: 67,
43
+ 40: 68,
44
+ 41: 70,
45
+ 42: 71,
46
+ 43: 72,
47
+ 44: 73,
48
+ 45: 74,
49
+ 46: 75,
50
+ 47: 76,
51
+ 48: 77,
52
+ 49: 79,
53
+ 50: 81,
54
+ 51: 82,
55
+ 52: 83,
56
+ 53: 84,
57
+ 54: 86,
58
+ 55: 87,
59
+ 56: 88,
60
+ 57: 89,
61
+ 57: 89,
62
+ 58: 90,
63
+ 59: 91,
64
+ 60: 93,
65
+ 61: 94,
66
+ 62: 96,
67
+ 63: 98,
68
+ 64: 99,
69
+ 65: 103,
70
+ 66: 104,
71
+ 67: 105,
72
+ 68: 108,
73
+ 69: 109,
74
+ 70: 111,
75
+ 71: 112,
76
+ 72: 113,
77
+ 73: 116,
78
+ 74: 117,
79
+ 75: 119,
80
+ 76: 120,
81
+ 77: 121,
82
+ 78: 122,
83
+ 79: 124,
84
+ 80: 125,
85
+ 81: 126,
86
+ 82: 127,
87
+ 83: 128,
88
+ 84: 130,
89
+ 85: 131,
90
+ 86: 133,
91
+ 87: 134,
92
+ 88: 135,
93
+ 89: 136,
94
+ 90: 137,
95
+ 91: 138,
96
+ 92: 139,
97
+ 93: 140,
98
+ 94: 143,
99
+ 95: 144,
100
+ 96: 145,
101
+ 97: 147,
102
+ 98: 148,
103
+ 99: 149,
104
+ 100: 150
105
+ }
106
+
107
+
108
+ CITYSCAPES_FG_NAMES = ['person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle']
annotator/OneFormer/datasets/panoptic2detection_coco_format.py ADDED
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python
2
+ # ------------------------------------------------------------------------------
3
+ # Reference: https://github.com/cocodataset/panopticapi/blob/master/converters/panoptic2detection_coco_format.py
4
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
5
+ # ------------------------------------------------------------------------------
6
+ '''
7
+ This script converts panoptic COCO format to detection COCO format. More
8
+ information about the formats can be found here:
9
+ http://cocodataset.org/#format-data. All segments will be stored in RLE format.
10
+
11
+ Additional option:
12
+ - using option '--things_only' the script can discard all stuff
13
+ segments, saving segments of things classes only.
14
+ '''
15
+ from __future__ import absolute_import
16
+ from __future__ import division
17
+ from __future__ import print_function
18
+ from __future__ import unicode_literals
19
+ import os, sys
20
+ import argparse
21
+ import numpy as np
22
+ import json
23
+ import time
24
+ import multiprocessing
25
+
26
+ import PIL.Image as Image
27
+
28
+ from panopticapi.utils import get_traceback, rgb2id, save_json
29
+
30
+ try:
31
+ # set up path for pycocotools
32
+ # sys.path.append('./cocoapi-master/PythonAPI/')
33
+ from pycocotools import mask as COCOmask
34
+ except Exception:
35
+ raise Exception("Please install pycocotools module from https://github.com/cocodataset/cocoapi")
36
+
37
+ @get_traceback
38
+ def convert_panoptic_to_detection_coco_format_single_core(
39
+ proc_id, annotations_set, categories, segmentations_folder, things_only
40
+ ):
41
+ annotations_detection = []
42
+ for working_idx, annotation in enumerate(annotations_set):
43
+ if working_idx % 100 == 0:
44
+ print('Core: {}, {} from {} images processed'.format(proc_id,
45
+ working_idx,
46
+ len(annotations_set)))
47
+
48
+ file_name = '{}.png'.format(annotation['file_name'].rsplit('.')[0])
49
+ try:
50
+ pan_format = np.array(
51
+ Image.open(os.path.join(segmentations_folder, file_name)), dtype=np.uint32
52
+ )
53
+ except IOError:
54
+ raise KeyError('no prediction png file for id: {}'.format(annotation['image_id']))
55
+ pan = rgb2id(pan_format)
56
+
57
+ for segm_info in annotation['segments_info']:
58
+ if things_only and categories[segm_info['category_id']]['isthing'] != 1:
59
+ continue
60
+ mask = (pan == segm_info['id']).astype(np.uint8)
61
+ mask = np.expand_dims(mask, axis=2)
62
+ segm_info.pop('id')
63
+ segm_info['image_id'] = annotation['image_id']
64
+ rle = COCOmask.encode(np.asfortranarray(mask))[0]
65
+ rle['counts'] = rle['counts'].decode('utf8')
66
+ segm_info['segmentation'] = rle
67
+ annotations_detection.append(segm_info)
68
+
69
+ print('Core: {}, all {} images processed'.format(proc_id, len(annotations_set)))
70
+ return annotations_detection
71
+
72
+
73
+ def convert_panoptic_to_detection_coco_format(input_json_file,
74
+ segmentations_folder,
75
+ output_json_file,
76
+ categories_json_file,
77
+ things_only):
78
+ start_time = time.time()
79
+
80
+ if segmentations_folder is None:
81
+ segmentations_folder = input_json_file.rsplit('.', 1)[0]
82
+
83
+ print("CONVERTING...")
84
+ print("COCO panoptic format:")
85
+ print("\tSegmentation folder: {}".format(segmentations_folder))
86
+ print("\tJSON file: {}".format(input_json_file))
87
+ print("TO")
88
+ print("COCO detection format")
89
+ print("\tJSON file: {}".format(output_json_file))
90
+ if things_only:
91
+ print("Saving only segments of things classes.")
92
+ print('\n')
93
+
94
+ print("Reading annotation information from {}".format(input_json_file))
95
+ with open(input_json_file, 'r') as f:
96
+ d_coco = json.load(f)
97
+ annotations_panoptic = d_coco['annotations']
98
+
99
+ with open(categories_json_file, 'r') as f:
100
+ categories_list = json.load(f)
101
+ categories = {category['id']: category for category in categories_list}
102
+
103
+ cpu_num = multiprocessing.cpu_count()
104
+ annotations_split = np.array_split(annotations_panoptic, cpu_num)
105
+ print("Number of cores: {}, images per core: {}".format(cpu_num, len(annotations_split[0])))
106
+ workers = multiprocessing.Pool(processes=cpu_num)
107
+ processes = []
108
+ for proc_id, annotations_set in enumerate(annotations_split):
109
+ p = workers.apply_async(convert_panoptic_to_detection_coco_format_single_core,
110
+ (proc_id, annotations_set, categories, segmentations_folder, things_only))
111
+ processes.append(p)
112
+ annotations_coco_detection = []
113
+ for p in processes:
114
+ annotations_coco_detection.extend(p.get())
115
+ for idx, ann in enumerate(annotations_coco_detection):
116
+ ann['id'] = idx
117
+
118
+ d_coco['annotations'] = annotations_coco_detection
119
+ categories_coco_detection = []
120
+ for category in d_coco['categories']:
121
+ if things_only and category['isthing'] != 1:
122
+ continue
123
+ category.pop('isthing')
124
+ categories_coco_detection.append(category)
125
+ d_coco['categories'] = categories_coco_detection
126
+ save_json(d_coco, output_json_file)
127
+
128
+ t_delta = time.time() - start_time
129
+ print("Time elapsed: {:0.2f} seconds".format(t_delta))
130
+
131
+
132
+ if __name__ == "__main__":
133
+ parser = argparse.ArgumentParser(
134
+ description="The script converts panoptic COCO format to detection \
135
+ COCO format. See this file's head for more information."
136
+ )
137
+ parser.add_argument('--things_only', action='store_true',
138
+ help="discard stuff classes")
139
+ args = parser.parse_args()
140
+
141
+ _root = os.getenv("DETECTRON2_DATASETS", "datasets")
142
+ root = os.path.join(_root, "coco")
143
+ input_json_file = os.path.join(root, "annotations", "panoptic_val2017.json")
144
+ output_json_file = os.path.join(root, "annotations", "panoptic2instances_val2017.json")
145
+ categories_json_file = "datasets/panoptic_coco_categories.json"
146
+ segmentations_folder = os.path.join(root, "panoptic_val2017")
147
+
148
+ convert_panoptic_to_detection_coco_format(input_json_file,
149
+ segmentations_folder,
150
+ output_json_file,
151
+ categories_json_file,
152
+ args.things_only)
annotator/OneFormer/datasets/prepare_ade20k_ins_seg.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import glob
5
+ import json
6
+ import os
7
+ from collections import Counter
8
+
9
+ import numpy as np
10
+ import tqdm
11
+ from panopticapi.utils import IdGenerator, save_json
12
+ from PIL import Image
13
+ import pycocotools.mask as mask_util
14
+
15
+
16
+ if __name__ == "__main__":
17
+ dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
18
+
19
+ for name, dirname in [("train", "training"), ("val", "validation")]:
20
+ image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/")
21
+ instance_dir = os.path.join(
22
+ dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/"
23
+ )
24
+
25
+ # img_id = 0
26
+ ann_id = 1
27
+
28
+ # json
29
+ out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_instance_{name}.json")
30
+
31
+ # json config
32
+ instance_config_file = "datasets/ade20k_instance_imgCatIds.json"
33
+ with open(instance_config_file) as f:
34
+ category_dict = json.load(f)["categories"]
35
+
36
+ # load catid mapping
37
+ # it is important to share category id for both instance and panoptic annotations
38
+ mapping_file = "datasets/ade20k_instance_catid_mapping.txt"
39
+ with open(mapping_file) as f:
40
+ map_id = {}
41
+ for i, line in enumerate(f.readlines()):
42
+ if i == 0:
43
+ continue
44
+ ins_id, sem_id, _ = line.strip().split()
45
+ # shift id by 1 because we want it to start from 0!
46
+ # ignore_label becomes 255
47
+ map_id[int(ins_id)] = int(sem_id) - 1
48
+
49
+ for cat in category_dict:
50
+ cat["id"] = map_id[cat["id"]]
51
+
52
+ filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
53
+
54
+ ann_dict = {}
55
+ images = []
56
+ annotations = []
57
+
58
+ for idx, filename in enumerate(tqdm.tqdm(filenames)):
59
+ image = {}
60
+ image_id = os.path.basename(filename).split(".")[0]
61
+
62
+ image["id"] = image_id
63
+ image["file_name"] = os.path.basename(filename)
64
+
65
+ original_format = np.array(Image.open(filename))
66
+ image["width"] = original_format.shape[1]
67
+ image["height"] = original_format.shape[0]
68
+
69
+ images.append(image)
70
+
71
+ filename_instance = os.path.join(instance_dir, image_id + ".png")
72
+ ins_seg = np.asarray(Image.open(filename_instance))
73
+ assert ins_seg.dtype == np.uint8
74
+
75
+ instance_cat_ids = ins_seg[..., 0]
76
+ # instance id starts from 1!
77
+ # because 0 is reserved as VOID label
78
+ instance_ins_ids = ins_seg[..., 1]
79
+
80
+ # process things
81
+ for thing_id in np.unique(instance_ins_ids):
82
+ if thing_id == 0:
83
+ continue
84
+ mask = instance_ins_ids == thing_id
85
+ instance_cat_id = np.unique(instance_cat_ids[mask])
86
+ assert len(instance_cat_id) == 1
87
+
88
+ anno = {}
89
+ anno['id'] = ann_id
90
+ ann_id += 1
91
+ anno['image_id'] = image['id']
92
+ anno["iscrowd"] = int(0)
93
+ anno["category_id"] = int(map_id[instance_cat_id[0]])
94
+
95
+ inds = np.nonzero(mask)
96
+ ymin, ymax = inds[0].min(), inds[0].max()
97
+ xmin, xmax = inds[1].min(), inds[1].max()
98
+ anno["bbox"] = [int(xmin), int(ymin), int(xmax - xmin + 1), int(ymax - ymin + 1)]
99
+ # if xmax <= xmin or ymax <= ymin:
100
+ # continue
101
+ rle = mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0]
102
+ rle["counts"] = rle["counts"].decode("utf-8")
103
+ anno["segmentation"] = rle
104
+ anno["area"] = int(mask_util.area(rle))
105
+ annotations.append(anno)
106
+
107
+ # save this
108
+ ann_dict['images'] = images
109
+ ann_dict['categories'] = category_dict
110
+ ann_dict['annotations'] = annotations
111
+
112
+ save_json(ann_dict, out_file)
annotator/OneFormer/datasets/prepare_ade20k_pan_seg.py ADDED
@@ -0,0 +1,500 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import glob
5
+ import json
6
+ import os
7
+ from collections import Counter
8
+
9
+ import numpy as np
10
+ import tqdm
11
+ from panopticapi.utils import IdGenerator, save_json
12
+ from PIL import Image
13
+
14
+ ADE20K_SEM_SEG_CATEGORIES = [
15
+ "wall",
16
+ "building",
17
+ "sky",
18
+ "floor",
19
+ "tree",
20
+ "ceiling",
21
+ "road, route",
22
+ "bed",
23
+ "window ",
24
+ "grass",
25
+ "cabinet",
26
+ "sidewalk, pavement",
27
+ "person",
28
+ "earth, ground",
29
+ "door",
30
+ "table",
31
+ "mountain, mount",
32
+ "plant",
33
+ "curtain",
34
+ "chair",
35
+ "car",
36
+ "water",
37
+ "painting, picture",
38
+ "sofa",
39
+ "shelf",
40
+ "house",
41
+ "sea",
42
+ "mirror",
43
+ "rug",
44
+ "field",
45
+ "armchair",
46
+ "seat",
47
+ "fence",
48
+ "desk",
49
+ "rock, stone",
50
+ "wardrobe, closet, press",
51
+ "lamp",
52
+ "tub",
53
+ "rail",
54
+ "cushion",
55
+ "base, pedestal, stand",
56
+ "box",
57
+ "column, pillar",
58
+ "signboard, sign",
59
+ "chest of drawers, chest, bureau, dresser",
60
+ "counter",
61
+ "sand",
62
+ "sink",
63
+ "skyscraper",
64
+ "fireplace",
65
+ "refrigerator, icebox",
66
+ "grandstand, covered stand",
67
+ "path",
68
+ "stairs",
69
+ "runway",
70
+ "case, display case, showcase, vitrine",
71
+ "pool table, billiard table, snooker table",
72
+ "pillow",
73
+ "screen door, screen",
74
+ "stairway, staircase",
75
+ "river",
76
+ "bridge, span",
77
+ "bookcase",
78
+ "blind, screen",
79
+ "coffee table",
80
+ "toilet, can, commode, crapper, pot, potty, stool, throne",
81
+ "flower",
82
+ "book",
83
+ "hill",
84
+ "bench",
85
+ "countertop",
86
+ "stove",
87
+ "palm, palm tree",
88
+ "kitchen island",
89
+ "computer",
90
+ "swivel chair",
91
+ "boat",
92
+ "bar",
93
+ "arcade machine",
94
+ "hovel, hut, hutch, shack, shanty",
95
+ "bus",
96
+ "towel",
97
+ "light",
98
+ "truck",
99
+ "tower",
100
+ "chandelier",
101
+ "awning, sunshade, sunblind",
102
+ "street lamp",
103
+ "booth",
104
+ "tv",
105
+ "plane",
106
+ "dirt track",
107
+ "clothes",
108
+ "pole",
109
+ "land, ground, soil",
110
+ "bannister, banister, balustrade, balusters, handrail",
111
+ "escalator, moving staircase, moving stairway",
112
+ "ottoman, pouf, pouffe, puff, hassock",
113
+ "bottle",
114
+ "buffet, counter, sideboard",
115
+ "poster, posting, placard, notice, bill, card",
116
+ "stage",
117
+ "van",
118
+ "ship",
119
+ "fountain",
120
+ "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
121
+ "canopy",
122
+ "washer, automatic washer, washing machine",
123
+ "plaything, toy",
124
+ "pool",
125
+ "stool",
126
+ "barrel, cask",
127
+ "basket, handbasket",
128
+ "falls",
129
+ "tent",
130
+ "bag",
131
+ "minibike, motorbike",
132
+ "cradle",
133
+ "oven",
134
+ "ball",
135
+ "food, solid food",
136
+ "step, stair",
137
+ "tank, storage tank",
138
+ "trade name",
139
+ "microwave",
140
+ "pot",
141
+ "animal",
142
+ "bicycle",
143
+ "lake",
144
+ "dishwasher",
145
+ "screen",
146
+ "blanket, cover",
147
+ "sculpture",
148
+ "hood, exhaust hood",
149
+ "sconce",
150
+ "vase",
151
+ "traffic light",
152
+ "tray",
153
+ "trash can",
154
+ "fan",
155
+ "pier",
156
+ "crt screen",
157
+ "plate",
158
+ "monitor",
159
+ "bulletin board",
160
+ "shower",
161
+ "radiator",
162
+ "glass, drinking glass",
163
+ "clock",
164
+ "flag", # noqa
165
+ ]
166
+
167
+ PALETTE = [
168
+ [120, 120, 120],
169
+ [180, 120, 120],
170
+ [6, 230, 230],
171
+ [80, 50, 50],
172
+ [4, 200, 3],
173
+ [120, 120, 80],
174
+ [140, 140, 140],
175
+ [204, 5, 255],
176
+ [230, 230, 230],
177
+ [4, 250, 7],
178
+ [224, 5, 255],
179
+ [235, 255, 7],
180
+ [150, 5, 61],
181
+ [120, 120, 70],
182
+ [8, 255, 51],
183
+ [255, 6, 82],
184
+ [143, 255, 140],
185
+ [204, 255, 4],
186
+ [255, 51, 7],
187
+ [204, 70, 3],
188
+ [0, 102, 200],
189
+ [61, 230, 250],
190
+ [255, 6, 51],
191
+ [11, 102, 255],
192
+ [255, 7, 71],
193
+ [255, 9, 224],
194
+ [9, 7, 230],
195
+ [220, 220, 220],
196
+ [255, 9, 92],
197
+ [112, 9, 255],
198
+ [8, 255, 214],
199
+ [7, 255, 224],
200
+ [255, 184, 6],
201
+ [10, 255, 71],
202
+ [255, 41, 10],
203
+ [7, 255, 255],
204
+ [224, 255, 8],
205
+ [102, 8, 255],
206
+ [255, 61, 6],
207
+ [255, 194, 7],
208
+ [255, 122, 8],
209
+ [0, 255, 20],
210
+ [255, 8, 41],
211
+ [255, 5, 153],
212
+ [6, 51, 255],
213
+ [235, 12, 255],
214
+ [160, 150, 20],
215
+ [0, 163, 255],
216
+ [140, 140, 200],
217
+ [250, 10, 15],
218
+ [20, 255, 0],
219
+ [31, 255, 0],
220
+ [255, 31, 0],
221
+ [255, 224, 0],
222
+ [153, 255, 0],
223
+ [0, 0, 255],
224
+ [255, 71, 0],
225
+ [0, 235, 255],
226
+ [0, 173, 255],
227
+ [31, 0, 255],
228
+ [11, 200, 200],
229
+ [255, 82, 0],
230
+ [0, 255, 245],
231
+ [0, 61, 255],
232
+ [0, 255, 112],
233
+ [0, 255, 133],
234
+ [255, 0, 0],
235
+ [255, 163, 0],
236
+ [255, 102, 0],
237
+ [194, 255, 0],
238
+ [0, 143, 255],
239
+ [51, 255, 0],
240
+ [0, 82, 255],
241
+ [0, 255, 41],
242
+ [0, 255, 173],
243
+ [10, 0, 255],
244
+ [173, 255, 0],
245
+ [0, 255, 153],
246
+ [255, 92, 0],
247
+ [255, 0, 255],
248
+ [255, 0, 245],
249
+ [255, 0, 102],
250
+ [255, 173, 0],
251
+ [255, 0, 20],
252
+ [255, 184, 184],
253
+ [0, 31, 255],
254
+ [0, 255, 61],
255
+ [0, 71, 255],
256
+ [255, 0, 204],
257
+ [0, 255, 194],
258
+ [0, 255, 82],
259
+ [0, 10, 255],
260
+ [0, 112, 255],
261
+ [51, 0, 255],
262
+ [0, 194, 255],
263
+ [0, 122, 255],
264
+ [0, 255, 163],
265
+ [255, 153, 0],
266
+ [0, 255, 10],
267
+ [255, 112, 0],
268
+ [143, 255, 0],
269
+ [82, 0, 255],
270
+ [163, 255, 0],
271
+ [255, 235, 0],
272
+ [8, 184, 170],
273
+ [133, 0, 255],
274
+ [0, 255, 92],
275
+ [184, 0, 255],
276
+ [255, 0, 31],
277
+ [0, 184, 255],
278
+ [0, 214, 255],
279
+ [255, 0, 112],
280
+ [92, 255, 0],
281
+ [0, 224, 255],
282
+ [112, 224, 255],
283
+ [70, 184, 160],
284
+ [163, 0, 255],
285
+ [153, 0, 255],
286
+ [71, 255, 0],
287
+ [255, 0, 163],
288
+ [255, 204, 0],
289
+ [255, 0, 143],
290
+ [0, 255, 235],
291
+ [133, 255, 0],
292
+ [255, 0, 235],
293
+ [245, 0, 255],
294
+ [255, 0, 122],
295
+ [255, 245, 0],
296
+ [10, 190, 212],
297
+ [214, 255, 0],
298
+ [0, 204, 255],
299
+ [20, 0, 255],
300
+ [255, 255, 0],
301
+ [0, 153, 255],
302
+ [0, 41, 255],
303
+ [0, 255, 204],
304
+ [41, 0, 255],
305
+ [41, 255, 0],
306
+ [173, 0, 255],
307
+ [0, 245, 255],
308
+ [71, 0, 255],
309
+ [122, 0, 255],
310
+ [0, 255, 184],
311
+ [0, 92, 255],
312
+ [184, 255, 0],
313
+ [0, 133, 255],
314
+ [255, 214, 0],
315
+ [25, 194, 194],
316
+ [102, 255, 0],
317
+ [92, 0, 255],
318
+ ]
319
+
320
+
321
+ if __name__ == "__main__":
322
+ dataset_dir = os.getenv("DETECTRON2_DATASETS", "datasets")
323
+
324
+ for name, dirname in [("train", "training"), ("val", "validation")]:
325
+ image_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/images/{dirname}/")
326
+ semantic_dir = os.path.join(dataset_dir, f"ADEChallengeData2016/annotations/{dirname}/")
327
+ instance_dir = os.path.join(
328
+ dataset_dir, f"ADEChallengeData2016/annotations_instance/{dirname}/"
329
+ )
330
+
331
+ # folder to store panoptic PNGs
332
+ out_folder = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_panoptic_{name}/")
333
+ # json with segmentations information
334
+ out_file = os.path.join(dataset_dir, f"ADEChallengeData2016/ade20k_panoptic_{name}.json")
335
+
336
+ if not os.path.isdir(out_folder):
337
+ print("Creating folder {} for panoptic segmentation PNGs".format(out_folder))
338
+ os.mkdir(out_folder)
339
+
340
+ # json config
341
+ config_file = "datasets/ade20k_instance_imgCatIds.json"
342
+ with open(config_file) as f:
343
+ config = json.load(f)
344
+
345
+ # load catid mapping
346
+ mapping_file = "datasets/ade20k_instance_catid_mapping.txt"
347
+ with open(mapping_file) as f:
348
+ map_id = {}
349
+ for i, line in enumerate(f.readlines()):
350
+ if i == 0:
351
+ continue
352
+ ins_id, sem_id, _ = line.strip().split()
353
+ # shift id by 1 because we want it to start from 0!
354
+ # ignore_label becomes 255
355
+ map_id[int(ins_id) - 1] = int(sem_id) - 1
356
+
357
+ ADE20K_150_CATEGORIES = []
358
+ for cat_id, cat_name in enumerate(ADE20K_SEM_SEG_CATEGORIES):
359
+ ADE20K_150_CATEGORIES.append(
360
+ {
361
+ "name": cat_name,
362
+ "id": cat_id,
363
+ "isthing": int(cat_id in map_id.values()),
364
+ "color": PALETTE[cat_id],
365
+ }
366
+ )
367
+ categories_dict = {cat["id"]: cat for cat in ADE20K_150_CATEGORIES}
368
+
369
+ panoptic_json_categories = ADE20K_150_CATEGORIES[:]
370
+ panoptic_json_images = []
371
+ panoptic_json_annotations = []
372
+
373
+ filenames = sorted(glob.glob(os.path.join(image_dir, "*.jpg")))
374
+ for idx, filename in enumerate(tqdm.tqdm(filenames)):
375
+ panoptic_json_image = {}
376
+ panoptic_json_annotation = {}
377
+
378
+ image_id = os.path.basename(filename).split(".")[0]
379
+
380
+ panoptic_json_image["id"] = image_id
381
+ panoptic_json_image["file_name"] = os.path.basename(filename)
382
+
383
+ original_format = np.array(Image.open(filename))
384
+ panoptic_json_image["width"] = original_format.shape[1]
385
+ panoptic_json_image["height"] = original_format.shape[0]
386
+
387
+ pan_seg = np.zeros(
388
+ (original_format.shape[0], original_format.shape[1], 3), dtype=np.uint8
389
+ )
390
+ id_generator = IdGenerator(categories_dict)
391
+
392
+ filename_semantic = os.path.join(semantic_dir, image_id + ".png")
393
+ filename_instance = os.path.join(instance_dir, image_id + ".png")
394
+
395
+ sem_seg = np.asarray(Image.open(filename_semantic))
396
+ ins_seg = np.asarray(Image.open(filename_instance))
397
+
398
+ assert sem_seg.dtype == np.uint8
399
+ assert ins_seg.dtype == np.uint8
400
+
401
+ semantic_cat_ids = sem_seg - 1
402
+ instance_cat_ids = ins_seg[..., 0] - 1
403
+ # instance id starts from 1!
404
+ # because 0 is reserved as VOID label
405
+ instance_ins_ids = ins_seg[..., 1]
406
+
407
+ segm_info = []
408
+
409
+ # NOTE: there is some overlap between semantic and instance annotation
410
+ # thus we paste stuffs first
411
+
412
+ # process stuffs
413
+ for semantic_cat_id in np.unique(semantic_cat_ids):
414
+ if semantic_cat_id == 255:
415
+ continue
416
+ if categories_dict[semantic_cat_id]["isthing"]:
417
+ continue
418
+ mask = semantic_cat_ids == semantic_cat_id
419
+ # should not have any overlap
420
+ assert pan_seg[mask].sum() == 0
421
+
422
+ segment_id, color = id_generator.get_id_and_color(semantic_cat_id)
423
+ pan_seg[mask] = color
424
+
425
+ area = np.sum(mask) # segment area computation
426
+ # bbox computation for a segment
427
+ hor = np.sum(mask, axis=0)
428
+ hor_idx = np.nonzero(hor)[0]
429
+ x = hor_idx[0]
430
+ width = hor_idx[-1] - x + 1
431
+ vert = np.sum(mask, axis=1)
432
+ vert_idx = np.nonzero(vert)[0]
433
+ y = vert_idx[0]
434
+ height = vert_idx[-1] - y + 1
435
+ bbox = [int(x), int(y), int(width), int(height)]
436
+
437
+ segm_info.append(
438
+ {
439
+ "id": int(segment_id),
440
+ "category_id": int(semantic_cat_id),
441
+ "area": int(area),
442
+ "bbox": bbox,
443
+ "iscrowd": 0,
444
+ }
445
+ )
446
+
447
+ # process things
448
+ for thing_id in np.unique(instance_ins_ids):
449
+ if thing_id == 0:
450
+ continue
451
+ mask = instance_ins_ids == thing_id
452
+ instance_cat_id = np.unique(instance_cat_ids[mask])
453
+ assert len(instance_cat_id) == 1
454
+
455
+ semantic_cat_id = map_id[instance_cat_id[0]]
456
+
457
+ segment_id, color = id_generator.get_id_and_color(semantic_cat_id)
458
+ pan_seg[mask] = color
459
+
460
+ area = np.sum(mask) # segment area computation
461
+ # bbox computation for a segment
462
+ hor = np.sum(mask, axis=0)
463
+ hor_idx = np.nonzero(hor)[0]
464
+ x = hor_idx[0]
465
+ width = hor_idx[-1] - x + 1
466
+ vert = np.sum(mask, axis=1)
467
+ vert_idx = np.nonzero(vert)[0]
468
+ y = vert_idx[0]
469
+ height = vert_idx[-1] - y + 1
470
+ bbox = [int(x), int(y), int(width), int(height)]
471
+
472
+ segm_info.append(
473
+ {
474
+ "id": int(segment_id),
475
+ "category_id": int(semantic_cat_id),
476
+ "area": int(area),
477
+ "bbox": bbox,
478
+ "iscrowd": 0,
479
+ }
480
+ )
481
+
482
+ panoptic_json_annotation = {
483
+ "image_id": image_id,
484
+ "file_name": image_id + ".png",
485
+ "segments_info": segm_info,
486
+ }
487
+
488
+ Image.fromarray(pan_seg).save(os.path.join(out_folder, image_id + ".png"))
489
+
490
+ panoptic_json_images.append(panoptic_json_image)
491
+ panoptic_json_annotations.append(panoptic_json_annotation)
492
+
493
+ # save this
494
+ d = {
495
+ "images": panoptic_json_images,
496
+ "annotations": panoptic_json_annotations,
497
+ "categories": panoptic_json_categories,
498
+ }
499
+
500
+ save_json(d, out_file)
annotator/OneFormer/datasets/prepare_ade20k_sem_seg.py ADDED
@@ -0,0 +1,27 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import numpy as np
8
+ import tqdm
9
+ from PIL import Image
10
+
11
+
12
+ def convert(input, output):
13
+ img = np.asarray(Image.open(input))
14
+ assert img.dtype == np.uint8
15
+ img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
16
+ Image.fromarray(img).save(output)
17
+
18
+
19
+ if __name__ == "__main__":
20
+ dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
21
+ for name in ["training", "validation"]:
22
+ annotation_dir = dataset_dir / "annotations" / name
23
+ output_dir = dataset_dir / "annotations_detectron2" / name
24
+ output_dir.mkdir(parents=True, exist_ok=True)
25
+ for file in tqdm.tqdm(list(annotation_dir.iterdir())):
26
+ output_file = output_dir / file.name
27
+ convert(file, output_file)
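
The `img = img - 1` line relies on uint8 wrap-around: ADE20K stores the ignore label as 0 and the classes as 1-150, so subtracting 1 sends the ignore label to 255 (the value detectron2 treats as ignore) and shifts the classes to 0-149. A quick check of that behavior:

import numpy as np

labels = np.array([0, 1, 150], dtype=np.uint8)
print(labels - 1)  # [255   0 149] -> ignore becomes 255, classes shift to 0-149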
annotator/OneFormer/datasets/prepare_coco_semantic_annos_from_panoptic_annos.py ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates.
+
+ import functools
+ import json
+ import multiprocessing as mp
+ import numpy as np
+ import os
+ import time
+ from fvcore.common.download import download
+ from panopticapi.utils import rgb2id
+ from PIL import Image
+
+ from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+
+
+ def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map):
+     panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32)
+     panoptic = rgb2id(panoptic)
+     output = np.zeros_like(panoptic, dtype=np.uint8) + 255
+     for seg in segments:
+         cat_id = seg["category_id"]
+         new_cat_id = id_map[cat_id]
+         output[panoptic == seg["id"]] = new_cat_id
+     Image.fromarray(output).save(output_semantic)
+
+
+ def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories):
+     """
+     Create semantic segmentation annotations from panoptic segmentation
+     annotations, to be used for semantic segmentation evaluation.
+     It maps all unlabeled pixels to class 255 and maps every category in
+     `categories` (thing and stuff alike) to a contiguous id, starting from 0.
+     Args:
+         panoptic_json (str): path to the panoptic json file, in COCO's format.
+         panoptic_root (str): a directory with panoptic annotation files, in COCO's format.
+         sem_seg_root (str): a directory to output semantic annotation files
+         categories (list[dict]): category metadata. Each dict needs to have:
+             "id": corresponds to the "category_id" in the json annotations
+             "isthing": 0 or 1
+     """
+     os.makedirs(sem_seg_root, exist_ok=True)
+
+     id_map = {}  # map from category id to id in the output semantic annotation
+     assert len(categories) <= 254
+     for i, k in enumerate(categories):
+         id_map[k["id"]] = i
+     # what is id = 0?
+     # id_map[0] = 255
+     print(id_map)
+
+     with open(panoptic_json) as f:
+         obj = json.load(f)
+
+     pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4))
+
+     def iter_annotations():
+         for anno in obj["annotations"]:
+             file_name = anno["file_name"]
+             segments = anno["segments_info"]
+             input = os.path.join(panoptic_root, file_name)
+             output = os.path.join(sem_seg_root, file_name)
+             yield input, output, segments
+
+     print("Start writing to {} ...".format(sem_seg_root))
+     start = time.time()
+     pool.starmap(
+         functools.partial(_process_panoptic_to_semantic, id_map=id_map),
+         iter_annotations(),
+         chunksize=100,
+     )
+     print("Finished. time: {:.2f}s".format(time.time() - start))
+
+
+ if __name__ == "__main__":
+     dataset_dir = os.path.join(os.getenv("DETECTRON2_DATASETS", "datasets"), "coco")
+     for s in ["val2017", "train2017"]:
+         separate_coco_semantic_from_panoptic(
+             os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)),
+             os.path.join(dataset_dir, "panoptic_{}".format(s)),
+             os.path.join(dataset_dir, "panoptic_semseg_{}".format(s)),
+             COCO_CATEGORIES,
+         )
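
Unlike the PanopticFPN variant this script is adapted from, the mapping here keeps thing and stuff classes distinct: every entry of COCO_CATEGORIES gets a contiguous id by position, and unlabeled pixels are written as 255, which is also why the code asserts `len(categories) <= 254` so that all ids fit in a uint8 PNG alongside the ignore value. A small sketch to inspect the same mapping the script builds (assuming detectron2 is installed):

from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES

# Mirror of the id_map built above: original COCO category id -> contiguous train id.
id_map = {k["id"]: i for i, k in enumerate(COCO_CATEGORIES)}
print(len(id_map), min(id_map.values()), max(id_map.values()))  # class count and id range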
annotator/OneFormer/demo/defaults.py ADDED
@@ -0,0 +1,82 @@
+ # ------------------------------------------------------------------------------
+ # Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
+ # Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+ # ------------------------------------------------------------------------------
+
+ import torch
+ import detectron2.data.transforms as T
+ from detectron2.checkpoint import DetectionCheckpointer
+ from detectron2.data import (
+     MetadataCatalog,
+ )
+ from detectron2.modeling import build_model
+
+
+ __all__ = [
+     "DefaultPredictor",
+ ]
+
+
+ class DefaultPredictor:
+     """
+     Create a simple end-to-end predictor with the given config that runs on
+     a single device for a single input image.
+     Compared to using the model directly, this class makes the following additions:
+     1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+     2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
+     3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+     4. Take one input image and produce a single output, instead of a batch.
+     This is meant for simple demo purposes, so it does the above steps automatically.
+     This is not meant for benchmarks or running complicated inference logic.
+     If you'd like to do anything more complicated, please refer to its source code as
+     examples to build and use the model manually.
+     Attributes:
+         metadata (Metadata): the metadata of the underlying dataset, obtained from
+             cfg.DATASETS.TEST.
+     Examples:
+     ::
+         pred = DefaultPredictor(cfg)
+         inputs = cv2.imread("input.jpg")
+         outputs = pred(inputs)
+     """
+
+     def __init__(self, cfg):
+         self.cfg = cfg.clone()  # cfg can be modified by model
+         self.model = build_model(self.cfg)
+         self.model.eval()
+         if len(cfg.DATASETS.TEST):
+             self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+
+         checkpointer = DetectionCheckpointer(self.model)
+         checkpointer.load(cfg.MODEL.WEIGHTS)
+
+         self.aug = T.ResizeShortestEdge(
+             [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+         )
+
+         self.input_format = cfg.INPUT.FORMAT
+         assert self.input_format in ["RGB", "BGR"], self.input_format
+
+     def __call__(self, original_image, task):
+         """
+         Args:
+             original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+         Returns:
+             predictions (dict):
+                 the output of the model for one image only.
+                 See :doc:`/tutorials/models` for details about the format.
+         """
+         with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
+             # Apply pre-processing to image.
+             if self.input_format == "RGB":
+                 # whether the model expects BGR inputs or RGB
+                 original_image = original_image[:, :, ::-1]
+             height, width = original_image.shape[:2]
+             image = self.aug.get_transform(original_image).apply_image(original_image)
+             image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+
+             task = f"The task is {task}"
+
+             inputs = {"image": image, "height": height, "width": width, "task": task}
+             predictions = self.model([inputs])[0]
+             return predictions
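
A minimal usage sketch for this predictor, assuming `cfg` has already been assembled with the helpers in oneformer/config.py (see the sketch after that file below) and `cfg.MODEL.WEIGHTS` points at a OneFormer checkpoint; the import path, image path, and task string are illustrative:

import cv2
from defaults import DefaultPredictor  # adjust the import to wherever this module lives

predictor = DefaultPredictor(cfg)            # cfg: a prepared detectron2 CfgNode (see below)
image = cv2.imread("input.jpg")              # BGR, matching the default cfg.INPUT.FORMAT
outputs = predictor(image, task="panoptic")  # wrapped internally as "The task is panoptic"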
annotator/OneFormer/oneformer/__init__.py ADDED
@@ -0,0 +1,18 @@
+ from . import data # register all new datasets
+ from . import modeling
+
+ # config
+ from .config import *
+
+ # dataset loading
+ from .data.dataset_mappers.coco_unified_new_baseline_dataset_mapper import COCOUnifiedNewBaselineDatasetMapper
+ from .data.dataset_mappers.oneformer_unified_dataset_mapper import (
+     OneFormerUnifiedDatasetMapper,
+ )
+
+ # models
+ from .oneformer_model import OneFormer
+ from .test_time_augmentation import SemanticSegmentorWithTTA
+
+ # evaluation
+ from .evaluation.instance_evaluation import InstanceSegEvaluator
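
These imports exist mainly for their side effects: pulling in the package registers the custom datasets, dataset mappers, and the OneFormer architecture with detectron2's registries. A short sketch, assuming the directory containing this package is on PYTHONPATH:

import oneformer  # noqa: F401  -- registers datasets and the OneFormer model on import
from oneformer import OneFormer, add_oneformer_config  # config helpers are re-exported via .config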
annotator/OneFormer/oneformer/config.py ADDED
@@ -0,0 +1,210 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ from detectron2.config import CfgNode as CN
+
+ __all__ = ["add_common_config", "add_oneformer_config", "add_swin_config",
+            "add_dinat_config", "add_convnext_config"]
+
+ def add_common_config(cfg):
+     """
+     Add common configuration options.
+     """
+
+     # data config
+     # select the dataset mapper
+     cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified"
+     # Color augmentation
+     cfg.INPUT.COLOR_AUG_SSD = False
+     # We retry random cropping until no single category in semantic segmentation GT occupies more
+     # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+     cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+     # Pad image and segmentation GT in dataset mapper.
+     cfg.INPUT.SIZE_DIVISIBILITY = -1
+
+     cfg.INPUT.TASK_SEQ_LEN = 77
+     cfg.INPUT.MAX_SEQ_LEN = 77
+
+     cfg.INPUT.TASK_PROB = CN()
+     cfg.INPUT.TASK_PROB.SEMANTIC = 0.33
+     cfg.INPUT.TASK_PROB.INSTANCE = 0.66
+
+     # test dataset
+     cfg.DATASETS.TEST_PANOPTIC = ("",)
+     cfg.DATASETS.TEST_INSTANCE = ("",)
+     cfg.DATASETS.TEST_SEMANTIC = ("",)
+
+     # solver config
+     # weight decay on embedding
+     cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
+     # optimizer
+     cfg.SOLVER.OPTIMIZER = "ADAMW"
+     cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
+
+     # wandb
+     cfg.WANDB = CN()
+     cfg.WANDB.PROJECT = "OneFormer"
+     cfg.WANDB.NAME = None
+
+     cfg.MODEL.IS_TRAIN = True
+     cfg.MODEL.IS_DEMO = False
+
+     # text encoder config
+     cfg.MODEL.TEXT_ENCODER = CN()
+
+     cfg.MODEL.TEXT_ENCODER.WIDTH = 256
+     cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77
+     cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12
+     cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408
+     cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2
+     cfg.MODEL.TEXT_ENCODER.N_CTX = 16
+
+     # oneformer inference config
+     cfg.MODEL.TEST = CN()
+     cfg.MODEL.TEST.SEMANTIC_ON = True
+     cfg.MODEL.TEST.INSTANCE_ON = False
+     cfg.MODEL.TEST.PANOPTIC_ON = False
+     cfg.MODEL.TEST.DETECTION_ON = False
+     cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0
+     cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0
+     cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
+     cfg.MODEL.TEST.TASK = "panoptic"
+
+     # TEST AUG Slide
+     cfg.TEST.AUG.IS_SLIDE = False
+     cfg.TEST.AUG.CROP_SIZE = (640, 640)
+     cfg.TEST.AUG.STRIDE = (426, 426)
+     cfg.TEST.AUG.SCALE = (2048, 640)
+     cfg.TEST.AUG.SETR_MULTI_SCALE = True
+     cfg.TEST.AUG.KEEP_RATIO = True
+     cfg.TEST.AUG.SIZE_DIVISOR = 32
+
+     # pixel decoder config
+     cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
+     # adding transformer in pixel decoder
+     cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
+     # pixel decoder
+     cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
+     cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256
+     cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256
+
+     # LSJ aug
+     cfg.INPUT.IMAGE_SIZE = 1024
+     cfg.INPUT.MIN_SCALE = 0.1
+     cfg.INPUT.MAX_SCALE = 2.0
+
+     # MSDeformAttn encoder configs
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
+     cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
+
+ def add_oneformer_config(cfg):
+     """
+     Add config for ONE_FORMER.
+     """
+
+     # oneformer model config
+     cfg.MODEL.ONE_FORMER = CN()
+
+     # loss
+     cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True
+     cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1
+     cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0
+     cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0
+     cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0
+     cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5
+     cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07
+
+     # transformer config
+     cfg.MODEL.ONE_FORMER.NHEADS = 8
+     cfg.MODEL.ONE_FORMER.DROPOUT = 0.1
+     cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048
+     cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0
+     cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2
+     cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6
+     cfg.MODEL.ONE_FORMER.PRE_NORM = False
+
+     cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256
+     cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120
+     cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16
+     cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True
+
+     cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5"
+     cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False
+
+     # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
+     # you can use this config to override
+     cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32
+
+     # transformer module
+     cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder"
+
+     # point loss configs
+     # Number of points sampled during training for a mask point head.
+     cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112
+     # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
+     # original paper.
+     cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0
+     # Importance sampling parameter for PointRend point sampling during training. Parameter `beta` in
+     # the original paper.
+     cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
+
+ def add_swin_config(cfg):
+     """
+     Add config for Swin Backbone.
+     """
+
+     # swin transformer backbone
+     cfg.MODEL.SWIN = CN()
+     cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
+     cfg.MODEL.SWIN.PATCH_SIZE = 4
+     cfg.MODEL.SWIN.EMBED_DIM = 96
+     cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+     cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+     cfg.MODEL.SWIN.WINDOW_SIZE = 7
+     cfg.MODEL.SWIN.MLP_RATIO = 4.0
+     cfg.MODEL.SWIN.QKV_BIAS = True
+     cfg.MODEL.SWIN.QK_SCALE = None
+     cfg.MODEL.SWIN.DROP_RATE = 0.0
+     cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
+     cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
+     cfg.MODEL.SWIN.APE = False
+     cfg.MODEL.SWIN.PATCH_NORM = True
+     cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+     cfg.MODEL.SWIN.USE_CHECKPOINT = False
+
+ def add_dinat_config(cfg):
+     """
+     Add config for DiNAT Backbone.
+     """
+
+     # DiNAT transformer backbone
+     cfg.MODEL.DiNAT = CN()
+     cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5]
+     cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+     cfg.MODEL.DiNAT.EMBED_DIM = 64
+     cfg.MODEL.DiNAT.MLP_RATIO = 3.0
+     cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16]
+     cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2
+     cfg.MODEL.DiNAT.KERNEL_SIZE = 7
+     cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+     cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3)
+     cfg.MODEL.DiNAT.QKV_BIAS = True
+     cfg.MODEL.DiNAT.QK_SCALE = None
+     cfg.MODEL.DiNAT.DROP_RATE = 0
+     cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0.
+     cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4
+
+ def add_convnext_config(cfg):
+     """
+     Add config for ConvNeXt Backbone.
+     """
+
+     # ConvNeXt backbone
+     cfg.MODEL.CONVNEXT = CN()
+     cfg.MODEL.CONVNEXT.IN_CHANNELS = 3
+     cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3]
+     cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536]
+     cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4
+     cfg.MODEL.CONVNEXT.LSIT = 1.0
+     cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3]
+     cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
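
A sketch of how these helpers are usually composed into a full config, mirroring other OneFormer entry points (add_deeplab_config comes from detectron2's DeepLab project; the yaml path refers to the configs shipped in this repo, and the import assumes the package is importable as `oneformer`):

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from oneformer.config import (
    add_common_config,
    add_oneformer_config,
    add_swin_config,
    add_dinat_config,
    add_convnext_config,
)

cfg = get_cfg()
add_deeplab_config(cfg)      # base keys referenced by the OneFormer yamls
add_common_config(cfg)
add_swin_config(cfg)
add_dinat_config(cfg)
add_convnext_config(cfg)
add_oneformer_config(cfg)
cfg.merge_from_file("configs/ade20k/oneformer_R50_bs16_160k.yaml")  # path relative to annotator/OneFormer
cfg.freeze()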