ariG23498 HF staff committed on
Commit d2ff88f • 1 Parent(s): 78907b3
app.py CHANGED
@@ -1,18 +1,8 @@
- import git
-
- git_url = "https://github.com/ariG23498/clip_dinoiser.git"
- repo_dir = "clip_dinoiser"
- git.Repo.clone_from(git_url, repo_dir)
-
- import os
-
- print(os.getcwd())
- os.chdir("clip_dinoiser/")
-
  from models.builder import build_model
- from utils.visualization import mask2rgb
+ from visualization import mask2rgb
  from segmentation.datasets import PascalVOCDataset

+ import os
  from hydra import compose, initialize
  from PIL import Image
  import matplotlib.pyplot as plt
@@ -23,6 +13,29 @@ from operator import itemgetter
  import torch
  import warnings

+ warnings.filterwarnings("ignore")
+ initialize(config_path="configs", version_base=None)
+
+ from huggingface_hub import Repository
+
+ repo = Repository(
+     local_dir="models",
+     clone_from="ariG23498/clip-dinoiser",
+     use_auth_token=os.environ.get("token")
+ )
+
+ check_path = 'models/checkpoints/last.pt'
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ check = torch.load(check_path, map_location=device)
+ dinoclip_cfg = "clip_dinoiser.yaml"
+ cfg = compose(config_name=dinoclip_cfg)
+
+ model = build_model(cfg.model, class_names=PascalVOCDataset.CLASSES).to(device)
+ model.clip_backbone.decode_head.use_templates = False  # switching off the imagenet templates for fast inference
+ model.load_state_dict(check['model_state_dict'], strict=False)
+ model = model.eval()
+
  import gradio as gr

  def greet(name):
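The hunk above still ends at the stock greet placeholder. A minimal sketch of how the freshly built model could back a segmentation demo instead is given below; the segment function, the 448x448 input size, the upsampling step and the colour-mapping call are illustrative assumptions, not part of this commit.

# Sketch only -- reuses model, device, mask2rgb and PascalVOCDataset from the code above.
import numpy as np
import torchvision.transforms as T

preprocess = T.Compose([T.Resize((448, 448)), T.ToTensor()])  # assumed input size

def segment(image):
    x = preprocess(image.convert("RGB")).unsqueeze(0).to(device)   # 1 x 3 x 448 x 448
    with torch.no_grad():
        logits = model(x)                                          # 1 x num_classes x h x w, patch resolution
    logits = torch.nn.functional.interpolate(logits, size=(448, 448), mode="bilinear", align_corners=False)
    mask = logits.argmax(dim=1)[0].cpu().numpy().astype(np.uint8)
    return mask2rgb(mask, PascalVOCDataset.PALETTE)                # mask2rgb signature assumed

demo = gr.Interface(fn=segment, inputs=gr.Image(type="pil"), outputs=gr.Image())
demo.launch()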
clip_dinoiser.yaml ADDED
@@ -0,0 +1,39 @@
+ _base_: "default.yml"
+ defaults:
+   - _self_
+
+ seed: 0
+ model_name: clip_dinoiser
+ model:
+   type: CLIP_DINOiser
+   clip_backbone: maskclip
+   mask_th: 0.2
+   in_dim: 256
+   certainty_th: 0.9
+   found_th: 0.5
+   feats_idx: -3
+
+ checkpoint_path: "checkpoints/last.pt"
+ output: logs
+
+ evaluate:
+   eval_only: true
+   task:
+     - voc
+     - voc20
+     - context
+     - context59
+     - coco_stuff
+     - coco_object
+     - cityscapes
+     - ade20k
+
+ # evaluation
+ voc: segmentation/configs/_base_/datasets/pascal_voc12.py
+ voc20: segmentation/configs/_base_/datasets/pascal_voc12_20.py
+ context: segmentation/configs/_base_/datasets/pascal_context.py
+ context59: segmentation/configs/_base_/datasets/pascal_context59.py
+ coco_stuff: segmentation/configs/_base_/datasets/stuff.py
+ coco_object: segmentation/configs/_base_/datasets/coco.py
+ cityscapes: segmentation/configs/_base_/datasets/cityscapes.py
+ ade20k: segmentation/configs/_base_/datasets/ade20k.py
models/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .maskclip import *
+ from .clip_dinoiser import *
models/builder.py ADDED
@@ -0,0 +1,16 @@
+ # ------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # author: Monika Wysoczanska
+ # ------------------------------------------------------------------------------
+ # Modified from GroupViT (https://github.com/NVlabs/GroupViT)
+ # Copyright (c) 2021-22, NVIDIA Corporation & affiliates. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+ from mmcv.utils import Registry
+ from omegaconf import OmegaConf
+
+ MODELS = Registry('models')
+
+
+ def build_model(config, class_names):
+     model = MODELS.build(OmegaConf.to_container(config, resolve=True),
+                          default_args={'class_names': class_names})
+     return model
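The Registry/build pattern above is what lets clip_dinoiser.yaml select a model by its type string. A hypothetical usage sketch follows; the inline config merely mirrors the YAML values and the three class names are placeholders.

# Hypothetical usage of MODELS/build_model; config keys mirror clip_dinoiser.yaml.
from omegaconf import OmegaConf
from models.builder import build_model

cfg = OmegaConf.create({
    "type": "CLIP_DINOiser",   # looked up in the MODELS registry
    "clip_backbone": "maskclip",
    "mask_th": 0.2, "in_dim": 256, "certainty_th": 0.9, "found_th": 0.5, "feats_idx": -3,
})
# Registry.build() pops 'type', finds the registered class and calls it with the
# remaining keys plus default_args -- here the class_names list.
model = build_model(cfg, class_names=["background", "aeroplane", "bicycle"])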
models/clip_dinoiser/__init__.py ADDED
@@ -0,0 +1 @@
+ from .clip_dinoiser import *
models/clip_dinoiser/clip_dinoiser.py ADDED
@@ -0,0 +1,120 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology & Oriane Simeoni, valeo.ai
+ # ---------------------------------------------------------------------------------------------------
+ import torch.nn as nn
+ from models.builder import MODELS
+ from models.builder import build_model
+ import torch
+ import torchvision.transforms as T
+ from omegaconf import OmegaConf
+ import torch.nn.functional as F
+
+ NORMALIZE = T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
+
+
+ @MODELS.register_module()
+ class CLIP_DINOiser(nn.Module):
+     def __init__(self, clip_backbone, class_names, mask_th=None, found_th=0.5, certainty_th=0.9, apply_found=False,
+                  in_dim=256, conv_kernel=3, feats_idx=-3):
+
+         super(CLIP_DINOiser, self).__init__()
+         self.mask_th = mask_th
+         self.apply_found = apply_found
+         self.found_th = found_th
+         self.certainty_th = certainty_th
+         self.sigmoid = nn.Sigmoid()
+         maskclip_cfg = OmegaConf.load(f"configs/{clip_backbone}.yaml")
+         self.clip_backbone = build_model(maskclip_cfg["model"], class_names=class_names)
+         self.vit_patch_size = self.clip_backbone.patch_size
+         self.feats_idx = feats_idx
+         self.in_dim = [in_dim]
+         in_size = 768 if self.feats_idx != 'final' else 512
+         self.bkg_decoder = nn.Conv2d(in_size, 1, (1, 1))
+         self.obj_proj = nn.Conv2d(in_size, in_dim, (conv_kernel, conv_kernel),
+                                   padding=conv_kernel // 2, padding_mode='replicate')
+
+         # setup clip feature for training
+         if feats_idx != 'final':
+             train_feats = {}
+             def get_activation(name):
+                 def hook(model, input, output):
+                     train_feats[name] = output.detach()
+                 return hook
+             self.clip_backbone.backbone.layers[feats_idx].ln2.register_forward_hook(get_activation('clip_inter'))
+             self.train_feats = train_feats
+
+
+     def forward_pass(self, x):
+         clip_feats = self.get_clip_map(x)[0]
+         B, c_dim, h, w = clip_feats.shape
+         _, _, H, W = x.shape
+         if self.feats_idx != 'final':
+             clip_feats = self.train_feats['clip_inter']
+             c_dim = clip_feats.shape[-1]
+             clip_feats = clip_feats[:, 1:].permute(0, 2, 1).reshape(B, c_dim, h, w)
+
+         proj_feats = self.obj_proj(clip_feats).reshape(B, self.in_dim[-1], -1)
+         proj_feats = proj_feats / proj_feats.norm(dim=1, keepdim=True)
+
+         corrs = torch.matmul(proj_feats.permute(0, 2, 1), proj_feats).reshape(B, h * w, h, w)
+         output = clip_feats / clip_feats.norm(dim=1, keepdim=True)
+         output = self.bkg_decoder(output)
+
+         return output, corrs
+
+     def forward(self, x):
+         preds, corrs = self.forward_pass(x)
+         output, _, _ = self.get_clip_map(x)
+         B, C, hf, wf = output.shape
+         preds = F.interpolate(preds, (hf, wf), mode="bilinear", align_corners=False)
+
+         # Compute weighted pooling
+         if self.mask_th:
+             corrs[corrs < self.mask_th] = 0.0
+         output = self.compute_weighted_pool(output, corrs)
+         output = output.reshape(B, C, hf, wf)
+         output = self.clip_backbone.decode_head.cls_seg(output)
+
+         if self.apply_found:
+             # Compute FOUND --------------------------------------------------
+             soft_found = self.sigmoid(preds.detach())
+             r_soft_found = soft_found.reshape(-1)
+             nb_cls = output.shape[1]
+             r_hard_found = (r_soft_found > self.found_th).float()
+
+             # TODO: make it work for batch size != 1
+             uncertain = (output.max(dim=1)[0] < self.certainty_th).reshape(-1)
+             output.reshape(1, nb_cls, -1)[:, 0, uncertain & (~r_hard_found.bool())] = 1.0  # background class
+
+         return output
+
+     def predict(self, x):
+         return self(x)
+
+     @torch.no_grad()
+     def get_clip_map(self, img):
+         maskclip_map, feat, k = self.clip_backbone(img, return_feat=True)
+
+         return feat, k, maskclip_map
+
+     @torch.no_grad()
+     def compute_weighted_pool(self, clipmap, corrs):
+         # upsampling
+         B = clipmap.shape[0]
+         h_m, w_m = clipmap.shape[-2:]
+         h_w, w_w = corrs.shape[-2:]
+
+         if (h_m != h_w) or (w_m != w_w):
+             clipmap = F.interpolate(clipmap, (h_w, w_w), mode="bilinear", align_corners=False)
+             h_m, w_m = h_w, w_w
+
+         corrs[corrs < 0.0] = 0.0  # B HW H W
+         clipmap_refined = torch.einsum("bnij, bcij -> bcn", corrs, clipmap)  # B C HW
+         norm_factor = corrs.flatten(-2, -1).sum(dim=-1)[:, None]  # B 1 HW
+         clipmap_refined = clipmap_refined / (norm_factor + 1e-6)
+
+         # reshape back to 2d
+         clipmap_refined = clipmap_refined.reshape(B, -1, h_m, w_m)
+
+         return clipmap_refined
models/maskclip/__init__.py ADDED
@@ -0,0 +1 @@
+ from .maskclip import *
models/maskclip/maskclip.py ADDED
@@ -0,0 +1,221 @@
+ # ------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # author: Monika Wysoczanska, Warsaw University of Technology
+ # ------------------------------------------------------------------------------
+ # Modified from OpenMMLab https://github.com/chongzhou96/MaskCLIP
+ # ------------------------------------------------------------------------------
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from mmseg.ops import resize
+ from typing import Any, List
+ from torch import Tensor
+ from mmcv.utils import print_log
+ from mmseg.utils import get_root_logger
+ from open_clip import get_tokenizer, create_model_from_pretrained
+ from models.builder import MODELS
+ from .vit import VisionTransformer
+ import torchvision.transforms as T
+ from .utils.embed import AdaptivePadding
+ from .utils.prompt_templates import imagenet_templates
+
+ OPENAI_NORMALIZE = T.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+
+
+ def make_vision_transformer(backbone_cfg):
+     model = VisionTransformer(**backbone_cfg)
+     model.init_weights()
+     return model
+
+
+ @MODELS.register_module()
+ class MaskClip(nn.Module):
+     def __init__(
+         self,
+         backbone,
+         decode_head,
+         clip_model,
+         class_names
+     ):
+         super(MaskClip, self).__init__()
+
+         self.decode_head = eval(decode_head.get('type'))(clip_model, class_names, **decode_head)
+         self.backbone = make_vision_transformer(backbone)
+         self.clip_T = OPENAI_NORMALIZE
+
+         self.to_PIL = T.ToPILImage()
+         self.patch_size = backbone.get('patch_size')
+         self.padding = AdaptivePadding(self.patch_size, self.patch_size)
+
+     def extract_feat(self, inputs: Tensor) -> List[Tensor]:
+         """Extract features from images."""
+         x = self.backbone(inputs)
+         return x
+
+     def forward(self, inputs: Tensor, return_feat=False) -> Tensor:
+         """Encode images with backbone and decode into a semantic segmentation
+         map of the same size as input."""
+         inputs = self.clip_T(inputs)
+         x = self.extract_feat(inputs)
+
+         seg_logits, feats, k = self.decode_head(x, return_feat)
+
+         if return_feat:
+             return seg_logits, feats, k
+         return seg_logits
+
+
+ class MaskClipHead(nn.Module):
+     def __init__(self, clip_model, class_names, visual_projs_path=None, in_index=-1, in_channels=3, norm_cfg=None, channels=0,
+                  text_channels=512, attn_pooling=False, align_corners=False, model_prefix='hf-hub:laion', use_templates=False, **kwargs):
+         super(MaskClipHead, self).__init__()
+
+         self.text_channels = text_channels
+         self.visual_projs_path = visual_projs_path
+         self.clip_model = clip_model
+         self.class_names = class_names
+         self.in_channels = in_channels
+         self.in_index = in_index  # from base decode head default
+         self._init_inputs(in_channels, in_index, None)
+         self.channels = channels
+         self.norm_cfg = norm_cfg
+         self.align_corners = align_corners
+         self.use_templates = use_templates
+
+         self.proj = nn.Conv2d(self.in_channels, text_channels, 1, bias=False)
+         self.load_visual_projs()
+
+         self.attn_pooling = attn_pooling
+         self.tokenizer = get_tokenizer(f'{model_prefix}/{clip_model}')
+         self.hf_modelname = f'{model_prefix}/{clip_model}'
+         model, _ = create_model_from_pretrained(f'{model_prefix}/{clip_model}')
+         model.eval()
+         self.register_buffer("class_embeddings", self._get_class_embeddings(model, class_names))
+
+     @torch.no_grad()
+     def update_vocab(self, class_names):
+         model, _ = create_model_from_pretrained(self.hf_modelname)
+         model.eval()
+         self.class_embeddings = self._get_class_embeddings(model, class_names)
+
+     @torch.no_grad()
+     def _embed_label(self, text_model: torch.nn.Module, label: str) -> torch.Tensor:
+         """
+         Encode a label name into a single vector.
+         """
+         if self.use_templates:
+             templates = imagenet_templates
+         else:
+             templates = ['a photo of an {}' if label[0] in 'aeiou' else 'a photo of a {}']  # "an" before vowel-initial labels
+
+         all_prompts = [self.tokenizer(template.format(label)) for template in templates]
+         out = text_model.encode_text(torch.cat(all_prompts))
+         out /= out.norm(dim=-1, keepdim=True)
+         out = out.mean(dim=0)
+         return out
+
+     def _get_class_embeddings(self, text_model: torch.nn.Module, class_names: List[str]):
+         aug_embeddings = torch.stack([self._embed_label(text_model, label) for label in class_names])
+         # normalize vectors
+         aug_embeddings = aug_embeddings / aug_embeddings.norm(dim=-1, keepdim=True)
+         return aug_embeddings.squeeze(1)
+
+     def load_visual_projs(self):
+         loaded = torch.load(self.visual_projs_path, map_location='cuda')
+         attrs = ['proj']
+         for attr in attrs:
+             current_attr = getattr(self, attr)
+             state_dict = loaded[attr]
+             for key in state_dict:
+                 if 'weight' in key:
+                     state_dict[key] = state_dict[key][:, :, None, None]
+             current_attr.load_state_dict(state_dict)
+         print_log(f'Loaded proj weights from {self.visual_projs_path}', logger=get_root_logger())
+
+     def forward(self, inputs, return_feat=False):
+         x = self._transform_inputs(inputs)
+         q, k, v, cls_token = None, None, None, None
+         if isinstance(x, list) and len(x) == 4:
+             x, q, k, v = x
+         if isinstance(x, list) and len(x) == 2:
+             x, cls_token = x
+         if v is not None:
+             feat = self.proj(v)
+         else:
+             feat = self.proj(x)
+         output = self.cls_seg(feat)
+         if return_feat:
+             return output, feat, k
+
+         return output
+
+     def _init_inputs(self, in_channels, in_index, input_transform):
+         """Check and initialize input transforms.
+
+         The in_channels, in_index and input_transform must match.
+         Specifically, when input_transform is None, only a single feature map
+         will be selected, so in_channels and in_index must be of type int.
+         When input_transform is not None, in_channels and in_index must be a list or tuple of the same length.
+
+         Args:
+             in_channels (int|Sequence[int]): Input channels.
+             in_index (int|Sequence[int]): Input feature index.
+             input_transform (str|None): Transformation type of input features.
+                 Options: 'resize_concat', 'multiple_select', None.
+                 'resize_concat': Multiple feature maps will be resized to the
+                     same size as the first one and then concatenated together.
+                     Usually used in the FCN head of HRNet.
+                 'multiple_select': Multiple feature maps will be bundled into
+                     a list and passed into the decode head.
+                 None: Only one selected feature map is allowed.
+         """
+
+         if input_transform is not None:
+             assert input_transform in ['resize_concat', 'multiple_select']
+         self.input_transform = input_transform
+         self.in_index = in_index
+         if input_transform is not None:
+             assert isinstance(in_channels, (list, tuple))
+             assert isinstance(in_index, (list, tuple))
+             assert len(in_channels) == len(in_index)
+             if input_transform == 'resize_concat':
+                 self.in_channels = sum(in_channels)
+             else:
+                 self.in_channels = in_channels
+         else:
+             assert isinstance(in_channels, int)
+             assert isinstance(in_index, int)
+             self.in_channels = in_channels
+
+     def cls_seg(self, feat):
+         feat = feat / feat.norm(dim=1, keepdim=True)
+         output = F.conv2d(feat, self.class_embeddings[:, :, None, None])
+         output = F.softmax(output * 100, dim=1)  # softmax of similarities with temperature scaling
+
+         return output
+
+     def _transform_inputs(self, inputs):
+         """Transform inputs for the decoder.
+
+         Args:
+             inputs (list[Tensor]): List of multi-level img features.
+
+         Returns:
+             Tensor: The transformed inputs
+         """
+         if self.input_transform == 'resize_concat':
+             inputs = [inputs[i] for i in self.in_index]
+             upsampled_inputs = [
+                 resize(
+                     input=x,
+                     size=inputs[0].shape[2:],
+                     mode='bilinear',
+                     align_corners=self.align_corners) for x in inputs
+             ]
+             inputs = torch.cat(upsampled_inputs, dim=1)
+         elif self.input_transform == 'multiple_select':
+             inputs = [inputs[i] for i in self.in_index]
+         else:
+             inputs = inputs[self.in_index]
+         return inputs
models/maskclip/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .embed import PatchEmbed
+ from .prompt_templates import imagenet_templates
+
+ __all__ = ['PatchEmbed', 'imagenet_templates']
models/maskclip/utils/embed.py ADDED
@@ -0,0 +1,334 @@
1
+ # From OpenMMLab https://github.com/chongzhou96/MaskCLIP
2
+ # Copyright (c) OpenMMLab. All rights reserved.
3
+
4
+ # ------------------------------------------------------------------------------
5
+
6
+ import math
7
+ from typing import Sequence
8
+
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from mmcv.cnn import build_conv_layer, build_norm_layer
12
+ from mmcv.runner.base_module import BaseModule
13
+ from mmcv.utils import to_2tuple
14
+
15
+
16
+ class AdaptivePadding(nn.Module):
17
+ """Applies padding to input (if needed) so that input can get fully covered
18
+ by filter you specified. It support two modes "same" and "corner". The
19
+ "same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
20
+ input. The "corner" mode would pad zero to bottom right.
21
+
22
+ Args:
23
+ kernel_size (int | tuple): Size of the kernel:
24
+ stride (int | tuple): Stride of the filter. Default: 1:
25
+ dilation (int | tuple): Spacing between kernel elements.
26
+ Default: 1.
27
+ padding (str): Support "same" and "corner", "corner" mode
28
+ would pad zero to bottom right, and "same" mode would
29
+ pad zero around input. Default: "corner".
30
+ Example:
31
+ >>> kernel_size = 16
32
+ >>> stride = 16
33
+ >>> dilation = 1
34
+ >>> input = torch.rand(1, 1, 15, 17)
35
+ >>> adap_pad = AdaptivePadding(
36
+ >>> kernel_size=kernel_size,
37
+ >>> stride=stride,
38
+ >>> dilation=dilation,
39
+ >>> padding="corner")
40
+ >>> out = adap_pad(input)
41
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
42
+ >>> input = torch.rand(1, 1, 16, 17)
43
+ >>> out = adap_pad(input)
44
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
45
+ """
46
+
47
+ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
48
+
49
+ super(AdaptivePadding, self).__init__()
50
+
51
+ assert padding in ('same', 'corner')
52
+
53
+ kernel_size = to_2tuple(kernel_size)
54
+ stride = to_2tuple(stride)
55
+ dilation = to_2tuple(dilation)
56
+
57
+ self.padding = padding
58
+ self.kernel_size = kernel_size
59
+ self.stride = stride
60
+ self.dilation = dilation
61
+
62
+ def get_pad_shape(self, input_shape):
63
+ input_h, input_w = input_shape
64
+ kernel_h, kernel_w = self.kernel_size
65
+ stride_h, stride_w = self.stride
66
+ output_h = math.ceil(input_h / stride_h)
67
+ output_w = math.ceil(input_w / stride_w)
68
+ pad_h = max((output_h - 1) * stride_h +
69
+ (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
70
+ pad_w = max((output_w - 1) * stride_w +
71
+ (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
72
+ return pad_h, pad_w
73
+
74
+ def forward(self, x):
75
+ pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
76
+ if pad_h > 0 or pad_w > 0:
77
+ if self.padding == 'corner':
78
+ x = F.pad(x, [0, pad_w, 0, pad_h])
79
+ elif self.padding == 'same':
80
+ x = F.pad(x, [
81
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
82
+ pad_h - pad_h // 2
83
+ ])
84
+ return x
85
+
86
+
87
+ class PatchEmbed(BaseModule):
88
+ """Image to Patch Embedding.
89
+
90
+ We use a conv layer to implement PatchEmbed.
91
+
92
+ Args:
93
+ in_channels (int): The num of input channels. Default: 3
94
+ embed_dims (int): The dimensions of embedding. Default: 768
95
+ conv_type (str): The config dict for embedding
96
+ conv layer type selection. Default: "Conv2d".
97
+ kernel_size (int): The kernel_size of embedding conv. Default: 16.
98
+ stride (int, optional): The slide stride of embedding conv.
99
+ Default: None (Would be set as `kernel_size`).
100
+ padding (int | tuple | string ): The padding length of
101
+ embedding conv. When it is a string, it means the mode
102
+ of adaptive padding, support "same" and "corner" now.
103
+ Default: "corner".
104
+ dilation (int): The dilation rate of embedding conv. Default: 1.
105
+ bias (bool): Bias of embed conv. Default: True.
106
+ norm_cfg (dict, optional): Config dict for normalization layer.
107
+ Default: None.
108
+ input_size (int | tuple | None): The size of input, which will be
109
+ used to calculate the out size. Only work when `dynamic_size`
110
+ is False. Default: None.
111
+ init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
112
+ Default: None.
113
+ """
114
+
115
+ def __init__(self,
116
+ in_channels=3,
117
+ embed_dims=768,
118
+ conv_type='Conv2d',
119
+ kernel_size=16,
120
+ stride=None,
121
+ padding='corner',
122
+ dilation=1,
123
+ bias=True,
124
+ norm_cfg=None,
125
+ input_size=None,
126
+ init_cfg=None):
127
+ super(PatchEmbed, self).__init__(init_cfg=init_cfg)
128
+
129
+ self.embed_dims = embed_dims
130
+ if stride is None:
131
+ stride = kernel_size
132
+
133
+ kernel_size = to_2tuple(kernel_size)
134
+ stride = to_2tuple(stride)
135
+ dilation = to_2tuple(dilation)
136
+
137
+ if isinstance(padding, str):
138
+ self.adap_padding = AdaptivePadding(
139
+ kernel_size=kernel_size,
140
+ stride=stride,
141
+ dilation=dilation,
142
+ padding=padding)
143
+ # disable the padding of conv
144
+ padding = 0
145
+ else:
146
+ self.adap_padding = None
147
+ padding = to_2tuple(padding)
148
+
149
+ self.projection = build_conv_layer(
150
+ dict(type=conv_type),
151
+ in_channels=in_channels,
152
+ out_channels=embed_dims,
153
+ kernel_size=kernel_size,
154
+ stride=stride,
155
+ padding=padding,
156
+ dilation=dilation,
157
+ bias=bias)
158
+
159
+ if norm_cfg is not None:
160
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
161
+ else:
162
+ self.norm = None
163
+
164
+ if input_size:
165
+ input_size = to_2tuple(input_size)
166
+ # `init_out_size` would be used outside to
167
+ # calculate the num_patches
168
+ # when `use_abs_pos_embed` outside
169
+ self.init_input_size = input_size
170
+ if self.adap_padding:
171
+ pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
172
+ input_h, input_w = input_size
173
+ input_h = input_h + pad_h
174
+ input_w = input_w + pad_w
175
+ input_size = (input_h, input_w)
176
+
177
+ # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
178
+ h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
179
+ (kernel_size[0] - 1) - 1) // stride[0] + 1
180
+ w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
181
+ (kernel_size[1] - 1) - 1) // stride[1] + 1
182
+ self.init_out_size = (h_out, w_out)
183
+ else:
184
+ self.init_input_size = None
185
+ self.init_out_size = None
186
+
187
+ def forward(self, x):
188
+ """
189
+ Args:
190
+ x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
191
+
192
+ Returns:
193
+ tuple: Contains merged results and its spatial shape.
194
+
195
+ - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
196
+ - out_size (tuple[int]): Spatial shape of x, arrange as
197
+ (out_h, out_w).
198
+ """
199
+
200
+ if self.adap_padding:
201
+ x = self.adap_padding(x)
202
+
203
+ x = self.projection(x)
204
+ out_size = (x.shape[2], x.shape[3])
205
+ x = x.flatten(2).transpose(1, 2)
206
+ if self.norm is not None:
207
+ x = self.norm(x)
208
+ return x, out_size
209
+
210
+
211
+ class PatchMerging(BaseModule):
212
+ """Merge patch feature map.
213
+
214
+ This layer groups feature map by kernel_size, and applies norm and linear
215
+ layers to the grouped feature map. Our implementation uses `nn.Unfold` to
216
+ merge patch, which is about 25% faster than original implementation.
217
+ Instead, we need to modify pretrained models for compatibility.
218
+
219
+ Args:
220
+ in_channels (int): The num of input channels.
221
+ out_channels (int): The num of output channels.
222
+ kernel_size (int | tuple, optional): the kernel size in the unfold
223
+ layer. Defaults to 2.
224
+ stride (int | tuple, optional): the stride of the sliding blocks in the
225
+ unfold layer. Default: None. (Would be set as `kernel_size`)
226
+ padding (int | tuple | string ): The padding length of
227
+ embedding conv. When it is a string, it means the mode
228
+ of adaptive padding, support "same" and "corner" now.
229
+ Default: "corner".
230
+ dilation (int | tuple, optional): dilation parameter in the unfold
231
+ layer. Default: 1.
232
+ bias (bool, optional): Whether to add bias in linear layer or not.
233
+ Defaults: False.
234
+ norm_cfg (dict, optional): Config dict for normalization layer.
235
+ Default: dict(type='LN').
236
+ init_cfg (dict, optional): The extra config for initialization.
237
+ Default: None.
238
+ """
239
+
240
+ def __init__(self,
241
+ in_channels,
242
+ out_channels,
243
+ kernel_size=2,
244
+ stride=None,
245
+ padding='corner',
246
+ dilation=1,
247
+ bias=False,
248
+ norm_cfg=dict(type='LN'),
249
+ init_cfg=None):
250
+ super().__init__(init_cfg=init_cfg)
251
+ self.in_channels = in_channels
252
+ self.out_channels = out_channels
253
+ if stride:
254
+ stride = stride
255
+ else:
256
+ stride = kernel_size
257
+
258
+ kernel_size = to_2tuple(kernel_size)
259
+ stride = to_2tuple(stride)
260
+ dilation = to_2tuple(dilation)
261
+
262
+ if isinstance(padding, str):
263
+ self.adap_padding = AdaptivePadding(
264
+ kernel_size=kernel_size,
265
+ stride=stride,
266
+ dilation=dilation,
267
+ padding=padding)
268
+ # disable the padding of unfold
269
+ padding = 0
270
+ else:
271
+ self.adap_padding = None
272
+
273
+ padding = to_2tuple(padding)
274
+ self.sampler = nn.Unfold(
275
+ kernel_size=kernel_size,
276
+ dilation=dilation,
277
+ padding=padding,
278
+ stride=stride)
279
+
280
+ sample_dim = kernel_size[0] * kernel_size[1] * in_channels
281
+
282
+ if norm_cfg is not None:
283
+ self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
284
+ else:
285
+ self.norm = None
286
+
287
+ self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
288
+
289
+ def forward(self, x, input_size):
290
+ """
291
+ Args:
292
+ x (Tensor): Has shape (B, H*W, C_in).
293
+ input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
294
+ Default: None.
295
+
296
+ Returns:
297
+ tuple: Contains merged results and its spatial shape.
298
+
299
+ - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
300
+ - out_size (tuple[int]): Spatial shape of x, arrange as
301
+ (Merged_H, Merged_W).
302
+ """
303
+ B, L, C = x.shape
304
+ assert isinstance(input_size, Sequence), f'Expect ' \
305
+ f'input_size is ' \
306
+ f'`Sequence` ' \
307
+ f'but get {input_size}'
308
+
309
+ H, W = input_size
310
+ assert L == H * W, 'input feature has wrong size'
311
+
312
+ x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
313
+ # Use nn.Unfold to merge patch. About 25% faster than original method,
314
+ # but need to modify pretrained model for compatibility
315
+
316
+ if self.adap_padding:
317
+ x = self.adap_padding(x)
318
+ H, W = x.shape[-2:]
319
+
320
+ x = self.sampler(x)
321
+ # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
322
+
323
+ out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
324
+ (self.sampler.kernel_size[0] - 1) -
325
+ 1) // self.sampler.stride[0] + 1
326
+ out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
327
+ (self.sampler.kernel_size[1] - 1) -
328
+ 1) // self.sampler.stride[1] + 1
329
+
330
+ output_size = (out_h, out_w)
331
+ x = x.transpose(1, 2) # B, H/2*W/2, 4*C
332
+ x = self.norm(x) if self.norm else x
333
+ x = self.reduction(x)
334
+ return x, output_size
models/maskclip/utils/prompt_templates.py ADDED
@@ -0,0 +1,82 @@
+ imagenet_templates = [
+     'a bad photo of a {}.',
+     'a photo of many {}.',
+     'a sculpture of a {}.',
+     'a photo of the hard to see {}.',
+     'a low resolution photo of the {}.',
+     'a rendering of a {}.',
+     'graffiti of a {}.',
+     'a bad photo of the {}.',
+     'a cropped photo of the {}.',
+     'a tattoo of a {}.',
+     'the embroidered {}.',
+     'a photo of a hard to see {}.',
+     'a bright photo of a {}.',
+     'a photo of a clean {}.',
+     'a photo of a dirty {}.',
+     'a dark photo of the {}.',
+     'a drawing of a {}.',
+     'a photo of my {}.',
+     'the plastic {}.',
+     'a photo of the cool {}.',
+     'a close-up photo of a {}.',
+     'a black and white photo of the {}.',
+     'a painting of the {}.',
+     'a painting of a {}.',
+     'a pixelated photo of the {}.',
+     'a sculpture of the {}.',
+     'a bright photo of the {}.',
+     'a cropped photo of a {}.',
+     'a plastic {}.',
+     'a photo of the dirty {}.',
+     'a jpeg corrupted photo of a {}.',
+     'a blurry photo of the {}.',
+     'a photo of the {}.',
+     'a good photo of the {}.',
+     'a rendering of the {}.',
+     'a {} in a video game.',
+     'a photo of one {}.',
+     'a doodle of a {}.',
+     'a close-up photo of the {}.',
+     'a photo of a {}.',
+     'the origami {}.',
+     'the {} in a video game.',
+     'a sketch of a {}.',
+     'a doodle of the {}.',
+     'a origami {}.',
+     'a low resolution photo of a {}.',
+     'the toy {}.',
+     'a rendition of the {}.',
+     'a photo of the clean {}.',
+     'a photo of a large {}.',
+     'a rendition of a {}.',
+     'a photo of a nice {}.',
+     'a photo of a weird {}.',
+     'a blurry photo of a {}.',
+     'a cartoon {}.',
+     'art of a {}.',
+     'a sketch of the {}.',
+     'a embroidered {}.',
+     'a pixelated photo of a {}.',
+     'itap of the {}.',
+     'a jpeg corrupted photo of the {}.',
+     'a good photo of a {}.',
+     'a plushie {}.',
+     'a photo of the nice {}.',
+     'a photo of the small {}.',
+     'a photo of the weird {}.',
+     'the cartoon {}.',
+     'art of the {}.',
+     'a drawing of the {}.',
+     'a photo of the large {}.',
+     'a black and white photo of a {}.',
+     'the plushie {}.',
+     'a dark photo of a {}.',
+     'itap of a {}.',
+     'graffiti of the {}.',
+     'a toy {}.',
+     'itap of my {}.',
+     'a photo of a cool {}.',
+     'a photo of a small {}.',
+     'a tattoo of the {}.',
+ ]
models/maskclip/vit.py ADDED
@@ -0,0 +1,470 @@
1
+ # Copyright (c) OpenMMLab. All rights reserved.
2
+ import math
3
+ import warnings
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from mmcv.cnn import build_norm_layer
8
+ from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
9
+ from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init,
10
+ trunc_normal_)
11
+ from mmcv.runner import BaseModule, ModuleList, _load_checkpoint
12
+ from torch.nn.modules.batchnorm import _BatchNorm
13
+ from torch.nn.modules.utils import _pair as to_2tuple
14
+ import torch.nn.functional as F
15
+
16
+ from mmseg.ops import resize
17
+ from mmseg.utils import get_root_logger
18
+
19
+ from models.maskclip.utils import PatchEmbed
20
+
21
+
22
+ class TransformerEncoderLayer(BaseModule):
23
+ """Implements one encoder layer in Vision Transformer.
24
+
25
+ Args:
26
+ embed_dims (int): The feature dimension.
27
+ num_heads (int): Parallel attention heads.
28
+ feedforward_channels (int): The hidden dimension for FFNs.
29
+ drop_rate (float): Probability of an element to be zeroed
30
+ after the feed forward layer. Default: 0.0.
31
+ attn_drop_rate (float): The drop out rate for attention layer.
32
+ Default: 0.0.
33
+ drop_path_rate (float): stochastic depth rate. Default 0.0.
34
+ num_fcs (int): The number of fully-connected layers for FFNs.
35
+ Default: 2.
36
+ qkv_bias (bool): enable bias for qkv if True. Default: True
37
+ act_cfg (dict): The activation config for FFNs.
38
+ Default: dict(type='GELU').
39
+ norm_cfg (dict): Config dict for normalization layer.
40
+ Default: dict(type='LN').
41
+ batch_first (bool): Key, Query and Value are shape of
42
+ (batch, n, embed_dim)
43
+ or (n, batch, embed_dim). Default: True.
44
+ """
45
+
46
+ def __init__(self,
47
+ embed_dims,
48
+ num_heads,
49
+ feedforward_channels,
50
+ drop_rate=0.,
51
+ attn_drop_rate=0.,
52
+ drop_path_rate=0.,
53
+ num_fcs=2,
54
+ qkv_bias=True,
55
+ act_cfg=dict(type='GELU'),
56
+ norm_cfg=dict(type='LN'),
57
+ batch_first=True):
58
+ super(TransformerEncoderLayer, self).__init__()
59
+
60
+ self.norm1_name, norm1 = build_norm_layer(
61
+ norm_cfg, embed_dims, postfix=1)
62
+ self.add_module(self.norm1_name, norm1)
63
+
64
+ self.attn = MultiheadAttention(
65
+ embed_dims=embed_dims,
66
+ num_heads=num_heads,
67
+ attn_drop=attn_drop_rate,
68
+ proj_drop=drop_rate,
69
+ dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
70
+ batch_first=batch_first,
71
+ bias=qkv_bias)
72
+
73
+ self.norm2_name, norm2 = build_norm_layer(
74
+ norm_cfg, embed_dims, postfix=2)
75
+ self.add_module(self.norm2_name, norm2)
76
+
77
+ self.ffn = FFN(
78
+ embed_dims=embed_dims,
79
+ feedforward_channels=feedforward_channels,
80
+ num_fcs=num_fcs,
81
+ ffn_drop=drop_rate,
82
+ dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
83
+ act_cfg=act_cfg)
84
+
85
+ @property
86
+ def norm1(self):
87
+ return getattr(self, self.norm1_name)
88
+
89
+ @property
90
+ def norm2(self):
91
+ return getattr(self, self.norm2_name)
92
+
93
+ def forward(self, x, return_qkv=False):
94
+ q, k, v = None, None, None
95
+ if return_qkv:
96
+ y = self.norm1(x)
97
+ y = F.linear(y, self.attn.attn.in_proj_weight, self.attn.attn.in_proj_bias)
98
+ N, L, C = y.shape
99
+ y = y.view(N, L, 3, C // 3).permute(2, 0, 1, 3).reshape(3 * N, L, C // 3)
100
+ y = F.linear(y, self.attn.attn.out_proj.weight, self.attn.attn.out_proj.bias)
101
+ q, k, v = y.tensor_split(3, dim=0)
102
+ v += x
103
+ v = self.ffn(self.norm2(v), identity=v)
104
+
105
+ x = self.attn(self.norm1(x), identity=x)
106
+ x = self.ffn(self.norm2(x), identity=x)
107
+ return x, q, k, v
108
+
109
+
110
+ class VisionTransformer(BaseModule):
111
+ """Vision Transformer.
112
+
113
+ This backbone is the implementation of `An Image is Worth 16x16 Words:
114
+ Transformers for Image Recognition at
115
+ Scale <https://arxiv.org/abs/2010.11929>`_.
116
+
117
+ Args:
118
+ img_size (int | tuple): Input image size. Default: 224.
119
+ patch_size (int): The patch size. Default: 16.
120
+ in_channels (int): Number of input channels. Default: 3.
121
+ embed_dims (int): embedding dimension. Default: 768.
122
+ num_layers (int): depth of transformer. Default: 12.
123
+ num_heads (int): number of attention heads. Default: 12.
124
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
125
+ Default: 4.
126
+ out_indices (list | tuple | int): Output from which stages.
127
+ Default: -1.
128
+ qkv_bias (bool): enable bias for qkv if True. Default: True.
129
+ drop_rate (float): Probability of an element to be zeroed.
130
+ Default 0.0
131
+ attn_drop_rate (float): The drop out rate for attention layer.
132
+ Default 0.0
133
+ drop_path_rate (float): stochastic depth rate. Default 0.0
134
+ with_cls_token (bool): Whether concatenating class token into image
135
+ tokens as transformer input. Default: True.
136
+ output_cls_token (bool): Whether output the cls_token. If set True,
137
+ `with_cls_token` must be True. Default: False.
138
+ norm_cfg (dict): Config dict for normalization layer.
139
+ Default: dict(type='LN')
140
+ act_cfg (dict): The activation config for FFNs.
141
+ Default: dict(type='GELU').
142
+ patch_norm (bool): Whether to add a norm in PatchEmbed Block.
143
+ Default: False.
144
+ final_norm (bool): Whether to add a additional layer to normalize
145
+ final feature map. Default: False.
146
+ interpolate_mode (str): Select the interpolate mode for position
147
+ embeding vector resize. Default: bicubic.
148
+ num_fcs (int): The number of fully-connected layers for FFNs.
149
+ Default: 2.
150
+ norm_eval (bool): Whether to set norm layers to eval mode, namely,
151
+ freeze running stats (mean and var). Note: Effect on Batch Norm
152
+ and its variants only. Default: False.
153
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save
154
+ some memory while slowing down the training speed. Default: False.
155
+ pretrained (str, optional): model pretrained path. Default: None.
156
+ init_cfg (dict or list[dict], optional): Initialization config dict.
157
+ Default: None.
158
+ """
159
+
160
+ def __init__(self,
161
+ img_size=224,
162
+ patch_size=16,
163
+ patch_bias=True,
164
+ in_channels=3,
165
+ embed_dims=768,
166
+ num_layers=12,
167
+ num_heads=12,
168
+ mlp_ratio=4,
169
+ out_indices=-1,
170
+ qkv_bias=True,
171
+ drop_rate=0.,
172
+ attn_drop_rate=0.,
173
+ drop_path_rate=0.,
174
+ with_cls_token=True,
175
+ output_cls_token=False,
176
+ norm_cfg=dict(type='LN'),
177
+ act_cfg=dict(type='GELU'),
178
+ patch_norm=False,
179
+ pre_norm=False,
180
+ final_norm=False,
181
+ return_qkv=False,
182
+ skip_last_attn=False,
183
+ interpolate_mode='bicubic',
184
+ num_fcs=2,
185
+ norm_eval=False,
186
+ with_cp=False,
187
+ pretrained=None,
188
+ init_cfg=None):
189
+ super(VisionTransformer, self).__init__(init_cfg=init_cfg)
190
+
191
+ if isinstance(img_size, int):
192
+ img_size = to_2tuple(img_size)
193
+ elif isinstance(img_size, tuple):
194
+ if len(img_size) == 1:
195
+ img_size = to_2tuple(img_size[0])
196
+ assert len(img_size) == 2, \
197
+ f'The size of image should have length 1 or 2, ' \
198
+ f'but got {len(img_size)}'
199
+
200
+ if output_cls_token:
201
+ assert with_cls_token is True, f'with_cls_token must be True if' \
202
+ f'set output_cls_token to True, but got {with_cls_token}'
203
+
204
+ assert not (init_cfg and pretrained), \
205
+ 'init_cfg and pretrained cannot be set at the same time'
206
+ if isinstance(pretrained, str):
207
+ warnings.warn('DeprecationWarning: pretrained is deprecated, '
208
+ 'please use "init_cfg" instead')
209
+ self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
210
+ elif pretrained is not None:
211
+ raise TypeError('pretrained must be a str or None')
212
+
213
+ self.img_size = img_size
214
+ self.patch_size = patch_size
215
+ self.interpolate_mode = interpolate_mode
216
+ self.norm_eval = norm_eval
217
+ self.with_cp = with_cp
218
+ self.pretrained = pretrained
219
+
220
+ self.patch_embed = PatchEmbed(
221
+ in_channels=in_channels,
222
+ embed_dims=embed_dims,
223
+ conv_type='Conv2d',
224
+ kernel_size=patch_size,
225
+ stride=patch_size,
226
+ padding='corner',
227
+ bias=patch_bias,
228
+ norm_cfg=norm_cfg if patch_norm else None,
229
+ init_cfg=None,
230
+ )
231
+
232
+ num_patches = (img_size[0] // patch_size) * \
233
+ (img_size[1] // patch_size)
234
+
235
+ self.with_cls_token = with_cls_token
236
+ self.output_cls_token = output_cls_token
237
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
238
+ self.pos_embed = nn.Parameter(
239
+ torch.zeros(1, num_patches + 1, embed_dims))
240
+ self.drop_after_pos = nn.Dropout(p=drop_rate)
241
+
242
+ if isinstance(out_indices, int):
243
+ if out_indices == -1:
244
+ out_indices = num_layers - 1
245
+ self.out_indices = [out_indices]
246
+ elif isinstance(out_indices, list) or isinstance(out_indices, tuple):
247
+ self.out_indices = out_indices
248
+ else:
249
+ raise TypeError('out_indices must be type of int, list or tuple')
250
+
251
+ dpr = [
252
+ x.item() for x in torch.linspace(0, drop_path_rate, num_layers)
253
+ ] # stochastic depth decay rule
254
+
255
+ self.layers = ModuleList()
256
+ for i in range(num_layers):
257
+ self.layers.append(
258
+ TransformerEncoderLayer(
259
+ embed_dims=embed_dims,
260
+ num_heads=num_heads,
261
+ feedforward_channels=mlp_ratio * embed_dims,
262
+ attn_drop_rate=attn_drop_rate,
263
+ drop_rate=drop_rate,
264
+ drop_path_rate=dpr[i],
265
+ num_fcs=num_fcs,
266
+ qkv_bias=qkv_bias,
267
+ act_cfg=act_cfg,
268
+ norm_cfg=norm_cfg,
269
+ batch_first=True))
270
+
271
+ self.pre_norm = pre_norm
272
+ if pre_norm:
273
+ self.norm0_name, norm0 = build_norm_layer(
274
+ norm_cfg, embed_dims, postfix=0)
275
+ self.add_module(self.norm0_name, norm0)
276
+
277
+ self.final_norm = final_norm
278
+ if final_norm:
279
+ self.norm1_name, norm1 = build_norm_layer(
280
+ norm_cfg, embed_dims, postfix=1)
281
+ self.add_module(self.norm1_name, norm1)
282
+
283
+ self.return_qkv = [False] * num_layers
284
+ if isinstance(return_qkv, bool):
285
+ for out_i in self.out_indices:
286
+ self.return_qkv[out_i] = return_qkv
287
+ elif isinstance(return_qkv, list) or isinstance(return_qkv, tuple):
288
+ for i, out_i in enumerate(self.out_indices):
289
+ self.return_qkv[out_i] = return_qkv[i]
290
+ else:
291
+ raise TypeError('return_qkv must be type of bool, list or tuple')
292
+
293
+ self.skip_last_attn = skip_last_attn
294
+
295
+ @property
296
+ def norm0(self):
297
+ return getattr(self, self.norm0_name)
298
+
299
+ @property
300
+ def norm1(self):
301
+ return getattr(self, self.norm1_name)
302
+
303
+ def init_weights(self):
304
+ if (isinstance(self.init_cfg, dict)
305
+ and self.init_cfg.get('type') == 'Pretrained'):
306
+ logger = get_root_logger()
307
+
308
+ checkpoint = _load_checkpoint(
309
+ self.init_cfg['checkpoint'], logger=logger, map_location='cpu')
310
+
311
+ if 'state_dict' in checkpoint:
312
+ state_dict = checkpoint['state_dict']
313
+ else:
314
+ state_dict = checkpoint
315
+
316
+ if 'pos_embed' in state_dict.keys():
317
+ if self.pos_embed.shape != state_dict['pos_embed'].shape:
318
+ logger.info(msg=f'Resize the pos_embed shape from '
319
+ f'{state_dict["pos_embed"].shape} to '
320
+ f'{self.pos_embed.shape}')
321
+ h, w = self.img_size
322
+ pos_size = int(
323
+ math.sqrt(state_dict['pos_embed'].shape[1] - 1))
324
+ state_dict['pos_embed'] = self.resize_pos_embed(
325
+ state_dict['pos_embed'],
326
+ (h // self.patch_size, w // self.patch_size),
327
+ (pos_size, pos_size), self.interpolate_mode)
328
+
329
+ print(self.load_state_dict(state_dict, False))
330
+ elif self.init_cfg is not None:
331
+ super(VisionTransformer, self).init_weights()
332
+ else:
333
+ # We only implement the 'jax_impl' initialization implemented at
334
+ # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501
335
+ trunc_normal_(self.pos_embed, std=.02)
336
+ trunc_normal_(self.cls_token, std=.02)
337
+ for n, m in self.named_modules():
338
+ if isinstance(m, nn.Linear):
339
+ trunc_normal_(m.weight, std=.02)
340
+ if m.bias is not None:
341
+ if 'ffn' in n:
342
+ nn.init.normal_(m.bias, mean=0., std=1e-6)
343
+ else:
344
+ nn.init.constant_(m.bias, 0)
345
+ elif isinstance(m, nn.Conv2d):
346
+ kaiming_init(m, mode='fan_in', bias=0.)
347
+ elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)):
348
+ constant_init(m, val=1.0, bias=0.)
349
+
350
+ def _pos_embeding(self, patched_img, hw_shape, pos_embed):
351
+ """Positiong embeding method.
352
+
353
+ Resize the pos_embed, if the input image size doesn't match
354
+ the training size.
355
+ Args:
356
+ patched_img (torch.Tensor): The patched image, it should be
357
+ shape of [B, L1, C].
358
+ hw_shape (tuple): The downsampled image resolution.
359
+ pos_embed (torch.Tensor): The pos_embed weighs, it should be
360
+ shape of [B, L2, c].
361
+ Return:
362
+ torch.Tensor: The pos encoded image feature.
363
+ """
364
+ assert patched_img.ndim == 3 and pos_embed.ndim == 3, \
365
+ 'the shapes of patched_img and pos_embed must be [B, L, C]'
366
+ x_len, pos_len = patched_img.shape[1], pos_embed.shape[1]
367
+ if x_len != pos_len:
368
+ if pos_len == (self.img_size[0] // self.patch_size) * (
369
+ self.img_size[1] // self.patch_size) + 1:
370
+ pos_h = self.img_size[0] // self.patch_size
371
+ pos_w = self.img_size[1] // self.patch_size
372
+ else:
373
+ raise ValueError(
374
+ 'Unexpected shape of pos_embed, got {}.'.format(
375
+ pos_embed.shape))
376
+ pos_embed = self.resize_pos_embed(pos_embed, hw_shape,
377
+ (pos_h, pos_w),
378
+ self.interpolate_mode)
379
+ return self.drop_after_pos(patched_img + pos_embed)
380
+
381
+ @staticmethod
382
+ def resize_pos_embed(pos_embed, input_shpae, pos_shape, mode):
383
+ """Resize pos_embed weights.
384
+
385
+ Resize pos_embed using bicubic interpolate method.
386
+ Args:
387
+ pos_embed (torch.Tensor): Position embedding weights.
388
+ input_shpae (tuple): Tuple for (downsampled input image height,
389
+ downsampled input image width).
390
+ pos_shape (tuple): The resolution of downsampled origin training
391
+ image.
392
+ mode (str): Algorithm used for upsampling:
393
+ ``'nearest'`` | ``'linear'`` | ``'bilinear'`` | ``'bicubic'`` |
394
+ ``'trilinear'``. Default: ``'nearest'``
395
+ Return:
396
+ torch.Tensor: The resized pos_embed of shape [B, L_new, C]
397
+ """
398
+ assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]'
399
+ pos_h, pos_w = pos_shape
400
+ cls_token_weight = pos_embed[:, 0]
401
+ pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):]
402
+ pos_embed_weight = pos_embed_weight.reshape(
403
+ 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2)
404
+ pos_embed_weight = resize(
405
+ pos_embed_weight, size=input_shpae, align_corners=False, mode=mode)
406
+ cls_token_weight = cls_token_weight.unsqueeze(1)
407
+ pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2)
408
+ pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1)
409
+ return pos_embed
410
+
411
+ def forward(self, inputs):
412
+ B = inputs.shape[0]
413
+
414
+ x, hw_shape = self.patch_embed(inputs)
415
+
416
+ # stole cls_tokens impl from Phil Wang, thanks
417
+ cls_tokens = self.cls_token.expand(B, -1, -1)
418
+ x = torch.cat((cls_tokens, x), dim=1)
419
+ x = self._pos_embeding(x, hw_shape, self.pos_embed)
420
+
421
+ if not self.with_cls_token:
422
+ # Remove class token for transformer encoder input
423
+ x = x[:, 1:]
424
+
425
+ if self.pre_norm:
426
+ x = self.norm0(x)
427
+
428
+ outs = []
429
+ for i, layer in enumerate(self.layers):
430
+ x, q, k, v = layer(x, self.return_qkv[i] \
431
+ or (i == len(self.layers) - 1 and self.skip_last_attn))
432
+ if i == len(self.layers) - 1:
433
+ if self.final_norm:
434
+ x = self.norm1(x)
435
+ if self.return_qkv[i]:
436
+ v = self.norm1(v)
437
+ if self.skip_last_attn:
438
+ if self.with_cls_token:
439
+ x[:, 1:] = v[:, 1:]
440
+ else:
441
+ x = v
442
+ if i in self.out_indices:
443
+ if self.with_cls_token:
444
+ # Remove class token and reshape token for decoder head
445
+ out = x[:, 1:]
446
+ else:
447
+ out = x
448
+ B, _, C = out.shape
449
+ out = out.reshape(B, hw_shape[0], hw_shape[1],
450
+ C).permute(0, 3, 1, 2).contiguous()
451
+ if self.output_cls_token:
452
+ out = [out, x[:, 0]]
453
+ if self.return_qkv[i]:
454
+ if self.with_cls_token:
455
+ q = q[:, 1:]
456
+ k = k[:, 1:]
457
+ v = v[:, 1:]
458
+ v = v.reshape(B, hw_shape[0], hw_shape[1],
459
+ C).permute(0, 3, 1, 2).contiguous()
460
+ out = [out, q, k, v]
461
+ outs.append(out)
462
+
463
+ return tuple(outs)
464
+
465
+ def train(self, mode=True):
466
+ super(VisionTransformer, self).train(mode)
467
+ if mode and self.norm_eval:
468
+ for m in self.modules():
469
+ if isinstance(m, nn.LayerNorm):
470
+ m.eval()
segmentation/configs/_base_/custom_import.py ADDED
@@ -0,0 +1,12 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology
+ # ----------------------------------------------------------------------------------------------------
+ # Modified from TCL
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+
+ custom_imports = dict(
+     imports=["segmentation.datasets.coco_object", "segmentation.datasets.pascal_voc", "datasets.transforms", "segmentation.datasets.pascal_voc20"],
+     allow_failed_imports=False,
+ )
segmentation/configs/_base_/datasets/ade20k.py ADDED
@@ -0,0 +1,58 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology
+ # ----------------------------------------------------------------------------------------------------
+ # Modified from TCL
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+ _base_ = ["../custom_import.py"]
+ # dataset settings
+ dataset_type = "ADE20KDataset"
+ data_root = "./data"
+
+ train_pipeline = [
+     dict(type="LoadImageFromFile"),
+     dict(type='ToRGB'),
+     dict(
+         type="MultiScaleFlipAug",
+         img_scale=(2048, 448),
+         flip=True,
+         transforms=[
+             dict(type='LoadImageFromFile'),
+             dict(type='ToRGB'),
+             dict(type='Resize', img_scale=(2048, 448)),
+             dict(type='RandomCrop', crop_size=(448, 448)),
+             dict(type='RandomFlip', prob=0.5),
+             dict(type='PhotoMetricDistortion'),
+             dict(type="ImageToTensorV2", keys=["img"]),
+             dict(type='Collect', keys=['img'], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
+         ],
+     ),
+ ]
+
+ test_pipeline = [
+     dict(type="LoadImageFromFile"),
+     dict(type='ToRGB'),
+     dict(
+         type="MultiScaleFlipAug",
+         img_scale=(2048, 448),
+         flip=False,
+         transforms=[
+             dict(type="Resize", keep_ratio=True),
+             dict(type="RandomFlip"),
+             dict(type="ImageToTensorV2", keys=["img"]),
+             dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
+         ],
+     ),
+ ]
+ data = dict(
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir="ADEChallengeData2016/images/validation",
+         ann_dir="ADEChallengeData2016/annotations/validation",
+         pipeline=test_pipeline,
+     )
+ )
+
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/cityscapes.py ADDED
@@ -0,0 +1,37 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology
+ # ----------------------------------------------------------------------------------------------------
+ # Modified from TCL
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+ _base_ = ["../custom_import.py"]
+ # dataset settings
+ dataset_type = "CityscapesDataset"
+ data_root = "./data/cityscapes"
+ test_pipeline = [
+     dict(type="LoadImageFromFile"),
+     dict(type='ToRGB'),
+     dict(
+         type="MultiScaleFlipAug",
+         img_scale=(2048, 448),
+         flip=False,
+         transforms=[
+             dict(type="Resize", keep_ratio=True),
+             dict(type="RandomFlip"),
+             dict(type="ImageToTensorV2", keys=["img"]),
+             dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
+         ],
+     ),
+ ]
+ data = dict(
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir="leftImg8bit/val",
+         ann_dir="gtFine/val",
+         pipeline=test_pipeline,
+     )
+ )
+
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/coco.py ADDED
@@ -0,0 +1,39 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology
+ # ----------------------------------------------------------------------------------------------------
+ # Modified from TCL
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+ _base_ = ["../custom_import.py"]
+ # dataset settings
+ dataset_type = "COCOObjectDataset"
+ data_root = "./data/coco_stuff164k"
+
+ test_pipeline = [
+     dict(type="LoadImageFromFile"),
+     dict(type='ToRGB'),
+     dict(
+         type="MultiScaleFlipAug",
+         img_scale=(2048, 448),
+         flip=False,
+         transforms=[
+             dict(type="Resize", keep_ratio=True),
+             dict(type="RandomFlip"),
+             dict(type="ImageToTensorV2", keys=["img"]),
+             dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
+         ],
+     ),
+ ]
+ data = dict(
+
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir="images/val2017",
+         ann_dir="annotations/val2017",
+         pipeline=test_pipeline,
+     )
+ )
+
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/pascal_context.py ADDED
@@ -0,0 +1,38 @@
+ # ---------------------------------------------------------------------------------------------------
+ # CLIP-DINOiser
+ # authors: Monika Wysoczanska, Warsaw University of Technology
+ # ----------------------------------------------------------------------------------------------------
+ # Modified from TCL
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
+ # ------------------------------------------------------------------------------
+ _base_ = ["../custom_import.py"]
+ # dataset settings
+ dataset_type = "PascalContextDataset"
+ data_root = "./data/VOCdevkit/VOC2010"
+ test_pipeline = [
+     dict(type="LoadImageFromFile"),
+     dict(type='ToRGB'),
+     dict(
+         type="MultiScaleFlipAug",
+         img_scale=(2048, 448),
+         flip=False,
+         transforms=[
+             dict(type="Resize", keep_ratio=True),
+             dict(type="RandomFlip"),
+             dict(type="ImageToTensorV2", keys=["img"]),
+             dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
+         ],
+     ),
+ ]
+ data = dict(
+     test=dict(
+         type=dataset_type,
+         data_root=data_root,
+         img_dir="JPEGImages",
+         ann_dir="SegmentationClassContext",
+         split="ImageSets/SegmentationContext/val.txt",
+         pipeline=test_pipeline,
+     )
+ )
+
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/pascal_context59.py ADDED
@@ -0,0 +1,38 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ # dataset settings
9
+ dataset_type = "PascalContextDataset59"
10
+ data_root = "./data/VOCdevkit/VOC2010"
11
+ test_pipeline = [
12
+ dict(type="LoadImageFromFile"),
13
+ dict(type='ToRGB'),
14
+ dict(
15
+ type="MultiScaleFlipAug",
16
+ img_scale=(2048, 448),
17
+ flip=False,
18
+ transforms=[
19
+ dict(type="Resize", keep_ratio=True),
20
+ dict(type="RandomFlip"),
21
+ dict(type="ImageToTensorV2", keys=["img"]),
22
+ dict(type="Collect", keys=["img"],
23
+ meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
24
+ ],
25
+ ),
26
+ ]
27
+ data = dict(
28
+ test=dict(
29
+ type=dataset_type,
30
+ data_root=data_root,
31
+ img_dir="JPEGImages",
32
+ ann_dir="SegmentationClassContext",
33
+ split="ImageSets/SegmentationContext/val.txt",
34
+ pipeline=test_pipeline,
35
+ )
36
+ )
37
+
38
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/pascal_voc12.py ADDED
@@ -0,0 +1,40 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ _base_ = ["../custom_import.py"]
9
+ # dataset settings
10
+ dataset_type = "PascalVOCDataset"
11
+ data_root = "./data/VOCdevkit/VOC2012"
12
+
13
+ test_pipeline = [
14
+ dict(type="LoadImageFromFile"),
15
+ dict(type='ToRGB'),
16
+ dict(
17
+ type="MultiScaleFlipAug",
18
+ img_scale=(2048, 448),
19
+ flip=False,
20
+ transforms=[
21
+ dict(type="Resize", keep_ratio=True),
22
+ dict(type="RandomFlip"),
23
+ dict(type="ImageToTensorV2", keys=["img"]),
24
+ dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
25
+ ],
26
+ ),
27
+ ]
28
+ data = dict(
29
+
30
+ test=dict(
31
+ type=dataset_type,
32
+ data_root=data_root,
33
+ img_dir="JPEGImages",
34
+ ann_dir="SegmentationClass",
35
+ split="ImageSets/Segmentation/val.txt",
36
+ pipeline=test_pipeline,
37
+ )
38
+ )
39
+
40
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/pascal_voc12_20.py ADDED
@@ -0,0 +1,40 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from GroupViT (https://github.com/NVlabs/GroupViT)
6
+ # Copyright (c) 2021-22, NVIDIA Corporation & affiliates. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ _base_ = ["../custom_import.py"]
9
+ # dataset settings
10
+ dataset_type = "PascalVOCDataset20"
11
+ data_root = "./data/VOCdevkit/VOC2012"
12
+ test_pipeline = [
13
+ dict(type="LoadImageFromFile"),
14
+ dict(type='ToRGB'),
15
+ dict(
16
+ type="MultiScaleFlipAug",
17
+ img_scale=(2048, 448),
18
+ flip=False,
19
+ transforms=[
20
+ dict(type="Resize", keep_ratio=True),
21
+ dict(type="RandomFlip"),
22
+ dict(type="ImageToTensorV2", keys=["img"]),
23
+ dict(type="Collect", keys=["img"],
24
+ meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
25
+ ],
26
+ ),
27
+ ]
28
+ data = dict(
29
+ test=dict(
30
+ type=dataset_type,
31
+ data_root=data_root,
32
+ img_dir="JPEGImages",
33
+ ann_dir="SegmentationClass",
34
+ split="ImageSets/Segmentation/val.txt",
35
+ pipeline=test_pipeline,
36
+
37
+ )
38
+ )
39
+
40
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/configs/_base_/datasets/stuff.py ADDED
@@ -0,0 +1,39 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ _base_ = ["../custom_import.py"]
9
+ # dataset settings
10
+ dataset_type = "COCOStuffDataset"
11
+ data_root = "./data/coco_stuff164k"
12
+
13
+ test_pipeline = [
14
+ dict(type="LoadImageFromFile"),
15
+ dict(type='ToRGB'),
16
+ dict(
17
+ type="MultiScaleFlipAug",
18
+ img_scale=(2048, 448),
19
+ flip=False,
20
+ transforms=[
21
+ dict(type="Resize", keep_ratio=True),
22
+ dict(type="RandomFlip"),
23
+ dict(type="ImageToTensorV2", keys=["img"]),
24
+ dict(type="Collect", keys=["img"], meta_keys=['ori_shape', 'img_shape', 'pad_shape', 'flip', 'img_info']),
25
+ ],
26
+ ),
27
+ ]
28
+ data = dict(
29
+
30
+ test=dict(
31
+ type=dataset_type,
32
+ data_root=data_root,
33
+ img_dir="images/val2017",
34
+ ann_dir="annotations/val2017",
35
+ pipeline=test_pipeline,
36
+ )
37
+ )
38
+
39
+ test_cfg = dict(mode="slide", stride=(224, 224), crop_size=(448, 448))
segmentation/datasets/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .coco_object import *
2
+ from .pascal_voc import *
3
+ from .pascal_voc20 import *
4
+ from .pascal_context import *
5
+ from .coco_stuff import *
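This __init__ is what wires the dataset classes below into mmseg: importing the package runs their @DATASETS.register_module() decorators, so the string type names used in the configs above ("COCOObjectDataset", "PascalVOCDataset20", ...) resolve at build time. A quick sketch, assuming mmsegmentation is installed and the repository root is on the Python path:

    from mmseg.datasets import DATASETS
    import segmentation.datasets  # noqa: F401  (side effect: registers the custom datasets)

    print(DATASETS.get("PascalVOCDataset20"))   # the class defined in this commit
    print(DATASETS.get("COCOObjectDataset"))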
segmentation/datasets/coco_object.py ADDED
@@ -0,0 +1,42 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ from mmseg.datasets import DATASETS, CustomDataset
9
+
10
+
11
+ @DATASETS.register_module()
12
+ class COCOObjectDataset(CustomDataset):
13
+ """COCO-Object dataset.
14
+
15
+ 1 bg class + first 80 classes from the COCO-Stuff dataset.
16
+ """
17
+
18
+ CLASSES = ('background', 'person', 'bicycle', 'car', 'motorcycle', 'aeroplane', 'bus', 'train', 'truck', 'boat',
19
+ 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
20
+ 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie',
21
+ 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
22
+ 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon',
23
+ 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
24
+ 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
25
+ 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
26
+ 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
27
+
28
+ PALETTE = [[0, 0, 0], [0, 192, 64], [0, 192, 64], [0, 64, 96], [128, 192, 192], [0, 64, 64], [0, 192, 224],
29
+ [0, 192, 192], [128, 192, 64], [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224], [0, 0, 64],
30
+ [0, 160, 192], [128, 0, 96], [128, 0, 192], [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
31
+ [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128], [64, 128, 32], [0, 160, 0], [0, 0, 0],
32
+ [192, 128, 160], [0, 32, 0], [0, 128, 128], [64, 128, 160], [128, 160, 0], [0, 128, 0], [192, 128, 32],
33
+ [128, 96, 128], [0, 0, 128], [64, 0, 32], [0, 224, 128], [128, 0, 0], [192, 0, 160], [0, 96, 128],
34
+ [128, 128, 128], [64, 0, 160], [128, 224, 128], [128, 128, 64], [192, 0, 32],
35
+ [128, 96, 0], [128, 0, 192], [0, 128, 32], [64, 224, 0], [0, 0, 64], [128, 128, 160], [64, 96, 0],
36
+ [0, 128, 192], [0, 128, 160], [192, 224, 0], [0, 128, 64], [128, 128, 32], [192, 32, 128], [0, 64, 192],
37
+ [0, 0, 32], [64, 160, 128], [128, 64, 64], [128, 0, 160], [64, 32, 128], [128, 192, 192], [0, 0, 160],
38
+ [192, 160, 128], [128, 192, 0], [128, 0, 96], [192, 32, 0], [128, 64, 128], [64, 128, 96], [64, 160, 0],
39
+ [0, 64, 0], [192, 128, 224], [64, 32, 0], [0, 192, 128], [64, 128, 224], [192, 160, 0]]
40
+
41
+ def __init__(self, **kwargs):
42
+ super(COCOObjectDataset, self).__init__(img_suffix='.jpg', seg_map_suffix='_instanceTrainIds.png', **kwargs)
segmentation/datasets/coco_stuff.py ADDED
@@ -0,0 +1,97 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+
5
+
6
+ from mmseg.datasets import DATASETS, CustomDataset
7
+
8
+
9
+ @DATASETS.register_module(force=True)
10
+ class COCOStuffDataset(CustomDataset):
11
+ """COCO-Stuff dataset.
12
+
13
+ In segmentation map annotation for COCO-Stuff, Train-IDs of the 10k version
14
+ are from 1 to 171, where 0 is the ignore index, and Train-ID of COCO Stuff
15
+ 164k is from 0 to 170, where 255 is the ignore index. So, they are all 171
16
+ semantic categories. ``reduce_zero_label`` is set to True and False for the
17
+ 10k and 164k versions, respectively. The ``img_suffix`` is fixed to '.jpg',
18
+ and ``seg_map_suffix`` is fixed to '.png'.
19
+ """
20
+ CLASSES = (
21
+ 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
22
+ 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
23
+ 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
24
+ 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
25
+ 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
26
+ 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
27
+ 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
28
+ 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
29
+ 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
30
+ 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
31
+ 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
32
+ 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
33
+ 'scissors', 'teddy bear', 'hair drier', 'toothbrush', 'banner',
34
+ 'blanket', 'branch', 'bridge', 'building-other', 'bush', 'cabinet',
35
+ 'cage', 'cardboard', 'carpet', 'ceiling-other', 'ceiling-tile',
36
+ 'cloth', 'clothes', 'clouds', 'counter', 'cupboard', 'curtain',
37
+ 'desk-stuff', 'dirt', 'door-stuff', 'fence', 'floor-marble',
38
+ 'floor-other', 'floor-stone', 'floor-tile', 'floor-wood',
39
+ 'flower', 'fog', 'food-other', 'fruit', 'furniture-other', 'grass',
40
+ 'gravel', 'ground-other', 'hill', 'house', 'leaves', 'light', 'mat',
41
+ 'metal', 'mirror-stuff', 'moss', 'mountain', 'mud', 'napkin', 'net',
42
+ 'paper', 'pavement', 'pillow', 'plant-other', 'plastic', 'platform',
43
+ 'playingfield', 'railing', 'railroad', 'river', 'road', 'rock', 'roof',
44
+ 'rug', 'salad', 'sand', 'sea', 'shelf', 'sky-other', 'skyscraper',
45
+ 'snow', 'solid-other', 'stairs', 'stone', 'straw', 'structural-other',
46
+ 'table', 'tent', 'textile-other', 'towel', 'tree', 'vegetable',
47
+ 'wall-brick', 'wall-concrete', 'wall-other', 'wall-panel',
48
+ 'wall-stone', 'wall-tile', 'wall-wood', 'water-other', 'waterdrops',
49
+ 'window-blind', 'window-other', 'wood')
50
+
51
+ PALETTE = [[0, 192, 64], [0, 192, 64], [0, 64, 96], [128, 192, 192],
52
+ [0, 64, 64], [0, 192, 224], [0, 192, 192], [128, 192, 64],
53
+ [0, 192, 96], [128, 192, 64], [128, 32, 192], [0, 0, 224],
54
+ [0, 0, 64], [0, 160, 192], [128, 0, 96], [128, 0, 192],
55
+ [0, 32, 192], [128, 128, 224], [0, 0, 192], [128, 160, 192],
56
+ [128, 128, 0], [128, 0, 32], [128, 32, 0], [128, 0, 128],
57
+ [64, 128, 32], [0, 160, 0], [0, 0, 0], [192, 128, 160],
58
+ [0, 32, 0], [0, 128, 128], [64, 128, 160], [128, 160, 0],
59
+ [0, 128, 0], [192, 128, 32], [128, 96, 128], [0, 0, 128],
60
+ [64, 0, 32], [0, 224, 128], [128, 0, 0], [192, 0, 160],
61
+ [0, 96, 128], [128, 128, 128], [64, 0, 160], [128, 224, 128],
62
+ [128, 128, 64], [192, 0, 32], [128, 96, 0], [128, 0, 192],
63
+ [0, 128, 32], [64, 224, 0], [0, 0, 64], [128, 128, 160],
64
+ [64, 96, 0], [0, 128, 192], [0, 128, 160], [192, 224, 0],
65
+ [0, 128, 64], [128, 128, 32], [192, 32, 128], [0, 64, 192],
66
+ [0, 0, 32], [64, 160, 128], [128, 64, 64], [128, 0, 160],
67
+ [64, 32, 128], [128, 192, 192], [0, 0, 160], [192, 160, 128],
68
+ [128, 192, 0], [128, 0, 96], [192, 32, 0], [128, 64, 128],
69
+ [64, 128, 96], [64, 160, 0], [0, 64, 0], [192, 128, 224],
70
+ [64, 32, 0], [0, 192, 128], [64, 128, 224], [192, 160, 0],
71
+ [0, 192, 0], [192, 128, 96], [192, 96, 128], [0, 64, 128],
72
+ [64, 0, 96], [64, 224, 128], [128, 64, 0], [192, 0, 224],
73
+ [64, 96, 128], [128, 192, 128], [64, 0, 224], [192, 224, 128],
74
+ [128, 192, 64], [192, 0, 96], [192, 96, 0], [128, 64, 192],
75
+ [0, 128, 96], [0, 224, 0], [64, 64, 64], [128, 128, 224],
76
+ [0, 96, 0], [64, 192, 192], [0, 128, 224], [128, 224, 0],
77
+ [64, 192, 64], [128, 128, 96], [128, 32, 128], [64, 0, 192],
78
+ [0, 64, 96], [0, 160, 128], [192, 0, 64], [128, 64, 224],
79
+ [0, 32, 128], [192, 128, 192], [0, 64, 224], [128, 160, 128],
80
+ [192, 128, 0], [128, 64, 32], [128, 32, 64], [192, 0, 128],
81
+ [64, 192, 32], [0, 160, 64], [64, 0, 0], [192, 192, 160],
82
+ [0, 32, 64], [64, 128, 128], [64, 192, 160], [128, 160, 64],
83
+ [64, 128, 0], [192, 192, 32], [128, 96, 192], [64, 0, 128],
84
+ [64, 64, 32], [0, 224, 192], [192, 0, 0], [192, 64, 160],
85
+ [0, 96, 192], [192, 128, 128], [64, 64, 160], [128, 224, 192],
86
+ [192, 128, 64], [192, 64, 32], [128, 96, 64], [192, 0, 192],
87
+ [0, 192, 32], [64, 224, 64], [64, 0, 64], [128, 192, 160],
88
+ [64, 96, 64], [64, 128, 192], [0, 192, 160], [192, 224, 64],
89
+ [64, 128, 64], [128, 192, 32], [192, 32, 192], [64, 64, 192],
90
+ [0, 64, 32], [64, 160, 192], [192, 64, 64], [128, 64, 160],
91
+ [64, 32, 192], [192, 192, 192], [0, 64, 160], [192, 160, 192],
92
+ [192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128],
93
+ [64, 192, 96], [64, 160, 64], [64, 64, 0]]
94
+
95
+ def __init__(self, **kwargs):
96
+ super(COCOStuffDataset, self).__init__(
97
+ img_suffix='.jpg', seg_map_suffix='_labelTrainIds.png', **kwargs)
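The ``reduce_zero_label`` flag mentioned in the docstring shifts labels so that raw id 0 becomes the ignore index. A tiny illustration of the mapping mmseg applies when the flag is True (the values below are hypothetical and only meant to show the shift):

    import numpy as np

    raw = np.array([0, 1, 2, 171])          # raw ids as stored in the 10k annotations
    reduced = raw.astype(np.int64) - 1      # shift every id down by one
    reduced[raw == 0] = 255                 # raw 0 (ignore) maps to 255, mmseg's ignore index
    print(reduced)                          # [255, 0, 1, 170]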
segmentation/datasets/pascal_context.py ADDED
@@ -0,0 +1,108 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # from MaskCLIP
6
+ # Copyright (c) OpenMMLab. All rights reserved.
7
+ # ------------------------------------------------------------------------------
8
+ from mmseg.datasets import DATASETS, CustomDataset
9
+ import os.path as osp
10
+
11
+
12
+ @DATASETS.register_module(force=True)
13
+ class PascalContextDataset(CustomDataset):
14
+ """PascalContext dataset.
15
+
16
+ In segmentation map annotation for PascalContext, 0 stands for background,
17
+ which is included in 60 categories. ``reduce_zero_label`` is fixed to
18
+ False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is
19
+ fixed to '.png'.
20
+
21
+ Args:
22
+ split (str): Split txt file for PascalContext.
23
+ """
24
+
25
+ CLASSES = ('background', 'aeroplane', 'bag', 'bed', 'bedclothes', 'bench',
26
+ 'bicycle', 'bird', 'boat', 'book', 'bottle', 'building', 'bus',
27
+ 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth',
28
+ 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence',
29
+ 'floor', 'flower', 'food', 'grass', 'ground', 'horse',
30
+ 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', 'person',
31
+ 'plate', 'platform', 'potted plant', 'road', 'rock', 'sheep',
32
+ 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table',
33
+ 'track', 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water',
34
+ 'window', 'wood')
35
+
36
+ PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
37
+ [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
38
+ [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
39
+ [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
40
+ [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
41
+ [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
42
+ [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
43
+ [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
44
+ [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
45
+ [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
46
+ [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
47
+ [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
48
+ [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
49
+ [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
50
+ [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]
51
+
52
+ def __init__(self, split, **kwargs):
53
+ super(PascalContextDataset, self).__init__(
54
+ img_suffix='.jpg',
55
+ seg_map_suffix='.png',
56
+ split=split,
57
+ reduce_zero_label=False,
58
+ **kwargs)
59
+ assert osp.exists(self.img_dir) and self.split is not None
60
+
61
+
62
+ @DATASETS.register_module(force=True)
63
+ class PascalContextDataset59(CustomDataset):
64
+ """PascalContext dataset.
65
+
66
+ In segmentation map annotation for PascalContext59, the background class is
67
+ not included in the 59 categories. ``reduce_zero_label`` is fixed to
68
+ True. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is
69
+ fixed to '.png'.
70
+
71
+ Args:
72
+ split (str): Split txt file for PascalContext.
73
+ """
74
+
75
+ CLASSES = ('aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle',
76
+ 'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet',
77
+ 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow',
78
+ 'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower',
79
+ 'food', 'grass', 'ground', 'horse', 'keyboard', 'light',
80
+ 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform',
81
+ 'potted plant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk',
82
+ 'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train',
83
+ 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', 'wood')
84
+
85
+ PALETTE = [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3],
86
+ [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230],
87
+ [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61],
88
+ [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140],
89
+ [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200],
90
+ [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71],
91
+ [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92],
92
+ [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6],
93
+ [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8],
94
+ [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8],
95
+ [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255],
96
+ [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140],
97
+ [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0],
98
+ [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0],
99
+ [0, 235, 255], [0, 173, 255], [31, 0, 255]]
100
+
101
+ def __init__(self, split, **kwargs):
102
+ super(PascalContextDataset59, self).__init__(
103
+ img_suffix='.jpg',
104
+ seg_map_suffix='.png',
105
+ split=split,
106
+ reduce_zero_label=True,
107
+ **kwargs)
108
+ assert osp.exists(self.img_dir) and self.split is not None
segmentation/datasets/pascal_voc.py ADDED
@@ -0,0 +1,40 @@
1
+ # ------------------------------------------------------------------------------
2
+ # TCL
3
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
4
+ # ------------------------------------------------------------------------------
5
+ # Modified from GroupViT (https://github.com/NVlabs/GroupViT)
6
+ # Copyright (c) 2021-22, NVIDIA Corporation & affiliates. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+ import os
9
+ from mmseg.datasets import DATASETS
10
+ from mmseg.datasets import CustomDataset
11
+
12
+
13
+ @DATASETS.register_module(force=True)
14
+ class PascalVOCDataset(CustomDataset):
15
+ """Pascal VOC dataset (the background class is ignored).
16
+ Borrowed from MaskCLIP
17
+
18
+ Args:
19
+ split (str): Split txt file for Pascal VOC.
20
+ """
21
+
22
+ CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
23
+ 'bus', 'car', 'cat', 'chair', 'cow', 'dining table', 'dog',
24
+ 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
25
+ 'train', 'tvmonitor')
26
+
27
+ PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
28
+ [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0],
29
+ [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128],
30
+ [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0],
31
+ [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]
32
+
33
+ def __init__(self, split, **kwargs):
34
+ super(PascalVOCDataset, self).__init__(
35
+ img_suffix='.jpg',
36
+ seg_map_suffix='.png',
37
+ split=split,
38
+ reduce_zero_label=False,
39
+ **kwargs)
40
+ assert os.path.exists(self.img_dir) and self.split is not None
segmentation/datasets/pascal_voc20.py ADDED
@@ -0,0 +1,40 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ----------------------------------------------------------------------------------------------------
5
+ # Modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ------------------------------------------------------------------------------
8
+
9
+ import os.path as osp
10
+ from mmseg.datasets import DATASETS
11
+ from mmseg.datasets import CustomDataset
12
+
13
+
14
+ @DATASETS.register_module()
15
+ class PascalVOCDataset20(CustomDataset):
16
+ """Pascal VOC dataset (the background class is ignored).
17
+
18
+ Args:
19
+ split (str): Split txt file for Pascal VOC.
20
+ """
21
+
22
+ CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
23
+ 'bus', 'car', 'cat', 'chair', 'cow', 'dining table', 'dog',
24
+ 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa',
25
+ 'train', 'tvmonitor')
26
+
27
+ PALETTE = [[128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
28
+ [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0],
29
+ [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128],
30
+ [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0],
31
+ [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]
32
+
33
+ def __init__(self, split, **kwargs):
34
+ super(PascalVOCDataset20, self).__init__(
35
+ img_suffix='.jpg',
36
+ seg_map_suffix='.png',
37
+ split=split,
38
+ reduce_zero_label=True,
39
+ **kwargs)
40
+ assert osp.exists(self.img_dir) and self.split is not None
segmentation/evaluation/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .builder import build_seg_dataloader, build_seg_dataset, build_seg_inference
segmentation/evaluation/builder.py ADDED
@@ -0,0 +1,66 @@
1
+ # ---------------------------------------------------------------------------------------------------
2
+ # CLIP-DINOiser
3
+ # authors: Monika Wysoczanska, Warsaw University of Technology
4
+ # ---------------------------------------------------------------------------------------------------
5
+ # modified from TCL
6
+ # Copyright (c) 2023 Kakao Brain. All Rights Reserved.
7
+ # ---------------------------------------------------------------------------------------------------
8
+
9
+ import mmcv
10
+ from mmseg.datasets import build_dataloader, build_dataset
11
+ from mmcv.utils import Registry
12
+ from mmcv.cnn import MODELS as MMCV_MODELS
13
+ MODELS = Registry('models', parent=MMCV_MODELS)
14
+ SEGMENTORS = MODELS
15
+ from .clip_dinoiser_eval import DinoCLIP_Infrencer
16
+
17
+
18
+ def build_seg_dataset(config):
19
+ """Build a dataset from config."""
20
+ cfg = mmcv.Config.fromfile(config)
21
+ dataset = build_dataset(cfg.data.test)
22
+ return dataset
23
+
24
+
25
+ def build_seg_dataloader(dataset, dist=True):
26
+ # batch size is set to 1 to handle varying image size (due to different aspect ratio)
27
+ if dist:
28
+ data_loader = build_dataloader(
29
+ dataset,
30
+ samples_per_gpu=1,
31
+ workers_per_gpu=2,
32
+ dist=dist,
33
+ shuffle=False,
34
+ persistent_workers=True,
35
+ pin_memory=False,
36
+ )
37
+ else:
38
+ data_loader = build_dataloader(
39
+ dataset=dataset,
40
+ samples_per_gpu=1,
41
+ workers_per_gpu=2,
42
+ dist=dist,
43
+ shuffle=False,
44
+ persistent_workers=True,
45
+ pin_memory=False,
46
+ )
47
+ return data_loader
48
+
49
+
50
+ def build_seg_inference(
51
+ model,
52
+ dataset,
53
+ config,
54
+ seg_config,
55
+ ):
56
+ dset_cfg = mmcv.Config.fromfile(seg_config) # dataset config
57
+ classnames = dataset.CLASSES
58
+ kwargs = dict()
59
+ if hasattr(dset_cfg, "test_cfg"):
60
+ kwargs["test_cfg"] = dset_cfg.test_cfg
61
+
62
+ seg_model = DinoCLIP_Infrencer(model, num_classes=len(classnames), **kwargs, **config.evaluate)
63
+ seg_model.CLASSES = dataset.CLASSES
64
+ seg_model.PALETTE = dataset.PALETTE
65
+
66
+ return seg_model
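A hedged end-to-end sketch of how these three helpers are meant to be chained for single-GPU evaluation. Here ``model`` is a CLIP-DINOiser instance and ``cfg`` a Hydra config with an ``evaluate`` section, both assumed to be built elsewhere; exact result handling may differ slightly across mmseg versions:

    from mmcv.parallel import MMDataParallel
    from mmseg.apis import single_gpu_test
    from segmentation.evaluation import build_seg_dataloader, build_seg_dataset, build_seg_inference

    # `model` (CLIP-DINOiser) and `cfg` (Hydra config) are assumed to exist already.
    voc_cfg = "segmentation/configs/_base_/datasets/pascal_voc12.py"
    dataset = build_seg_dataset(voc_cfg)                 # PascalVOCDataset with the test pipeline
    loader = build_seg_dataloader(dataset, dist=False)   # batch size 1, see comment above
    seg_model = build_seg_inference(model, dataset, cfg, voc_cfg)

    results = single_gpu_test(MMDataParallel(seg_model, device_ids=[0]), loader, pre_eval=True)
    print(dataset.evaluate(results, metric="mIoU"))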
segmentation/evaluation/clip_dinoiser_eval.py ADDED
@@ -0,0 +1,34 @@
1
+ # ------------------------------------------------------------------------------
2
+ import torch
3
+ import logging
4
+ log = logging.getLogger(__name__)
5
+ from mmseg.ops import resize
6
+ from mmseg.models import EncoderDecoder
7
+
8
+ class DinoCLIP_Infrencer(EncoderDecoder):
9
+ def __init__(
10
+ self,
11
+ model,
12
+ num_classes,
13
+ test_cfg=dict(),
14
+ **kwargs,
15
+ ):
16
+ super(EncoderDecoder, self).__init__()
17
+ self.mode = test_cfg['mode']
18
+ self.num_classes = num_classes
19
+ self.model = model
20
+ self.test_cfg = test_cfg
21
+ self.align_corners = False
22
+
23
+ @torch.no_grad()
24
+ def encode_decode(self, img, meta_data):
25
+ """
26
+ """
27
+ masks = self.model(img)
28
+ masks = resize(
29
+ input=masks,
30
+ size=img.shape[-2:],
31
+ mode='bilinear',
32
+ align_corners=self.align_corners)
33
+ return masks
34
+
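For reference, a minimal sketch of what this wrapper provides: mmseg's sliding-window inference (inherited from EncoderDecoder) calls encode_decode on each crop, which here just forwards the crop through the wrapped CLIP-DINOiser model and upsamples the per-class masks. The shapes below are assumptions based on the crop_size used in the configs of this commit:

    import torch

    # `seg_model` is assumed to be a DinoCLIP_Infrencer built via build_seg_inference above.
    crop = torch.randn(1, 3, 448, 448)                     # one crop, matching crop_size=(448, 448)
    masks = seg_model.encode_decode(crop, meta_data=None)  # (1, num_classes, 448, 448) soft masks
    pred = masks.argmax(dim=1)                             # per-pixel class indices for the crop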
visualization.py ADDED
@@ -0,0 +1,7 @@
1
+ import numpy as np
2
+
3
+ def mask2rgb(mask, palette):
4
+ img = np.zeros((mask.shape[0], mask.shape[1], 3))
5
+ for l in np.unique(mask):
6
+ img[mask == int(l)] = palette[int(l)]
7
+ return img.astype(int)
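A small usage sketch for mask2rgb, pairing it with the PascalVOCDataset palette defined earlier in this commit (the predicted mask here is random, purely for illustration):

    import numpy as np
    import matplotlib.pyplot as plt
    from segmentation.datasets import PascalVOCDataset
    from visualization import mask2rgb

    pred = np.random.randint(0, len(PascalVOCDataset.CLASSES), size=(448, 448))
    rgb = mask2rgb(pred, PascalVOCDataset.PALETTE)   # (448, 448, 3) array of 0-255 ints
    plt.imshow(rgb)
    plt.axis("off")
    plt.show()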