vobecant committed
Commit: dd78229
Parent: 83a95c0

Initial commit
.idea/DaS.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="jdk" jdkName="Python 3.8 (pytorch) (2)" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,26 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+       <option name="ignoredPackages">
+         <value>
+           <list size="13">
+             <item index="0" class="java.lang.String" itemvalue="yacs" />
+             <item index="1" class="java.lang.String" itemvalue="termcolor" />
+             <item index="2" class="java.lang.String" itemvalue="pydot" />
+             <item index="3" class="java.lang.String" itemvalue="fvcore" />
+             <item index="4" class="java.lang.String" itemvalue="tabulate" />
+             <item index="5" class="java.lang.String" itemvalue="mock" />
+             <item index="6" class="java.lang.String" itemvalue="pycocotools" />
+             <item index="7" class="java.lang.String" itemvalue="prettytable" />
+             <item index="8" class="java.lang.String" itemvalue="interrogate" />
+             <item index="9" class="java.lang.String" itemvalue="cityscapesscripts" />
+             <item index="10" class="java.lang.String" itemvalue="isort" />
+             <item index="11" class="java.lang.String" itemvalue="xdoctest" />
+             <item index="12" class="java.lang.String" itemvalue="codecov" />
+           </list>
+         </value>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (pytorch) (2)" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/DaS.iml" filepath="$PROJECT_DIR$/.idea/DaS.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$" vcs="Git" />
+   </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,133 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ChangeListManager">
+     <list default="true" id="5dd22f22-8223-4d55-99f9-57d1e00622d7" name="Default Changelist" comment="Initial commit.">
+       <change afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/examples/img1.jpg" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/backbone_picie.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/blocks.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/decoder.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/factory.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/fpn_picie.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/picie_model.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/resnet_dilated.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/segmenter.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/torch.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/utils.py" afterDir="false" />
+       <change afterPath="$PROJECT_DIR$/segmenter_model/vit_dino.py" afterDir="false" />
+       <change beforePath="$PROJECT_DIR$/README.md" beforeDir="false" afterPath="$PROJECT_DIR$/README.md" afterDir="false" />
+     </list>
+     <option name="SHOW_DIALOG" value="false" />
+     <option name="HIGHLIGHT_CONFLICTS" value="true" />
+     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+     <option name="LAST_RESOLUTION" value="IGNORE" />
+   </component>
+   <component name="FileTemplateManagerImpl">
+     <option name="RECENT_TEMPLATES">
+       <list>
+         <option value="Python Script" />
+       </list>
+     </option>
+   </component>
+   <component name="Git.Settings">
+     <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+     <option name="UPDATE_TYPE" value="REBASE" />
+   </component>
+   <component name="ProjectId" id="26QLDSf8iYKDlLRah6kIg09oqIa" />
+   <component name="ProjectLevelVcsManager" settingsEditedManually="true" />
+   <component name="ProjectViewState">
+     <option name="hideEmptyMiddlePackages" value="true" />
+     <option name="showLibraryContents" value="true" />
+   </component>
+   <component name="PropertiesComponent">
+     <property name="RunOnceActivity.OpenProjectViewOnStart" value="true" />
+     <property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
+     <property name="WebServerToolWindowFactoryState" value="true" />
+     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+     <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
+   </component>
+   <component name="RecentsManager">
+     <key name="CopyFile.RECENT_KEYS">
+       <recent name="$PROJECT_DIR$" />
+       <recent name="$PROJECT_DIR$/examples" />
+     </key>
+     <key name="MoveFile.RECENT_KEYS">
+       <recent name="$PROJECT_DIR$/examples" />
+     </key>
+   </component>
+   <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+   <component name="TaskManager">
+     <task active="true" id="Default" summary="Default task">
+       <changelist id="5dd22f22-8223-4d55-99f9-57d1e00622d7" name="Default Changelist" comment="" />
+       <created>1647350746642</created>
+       <option name="number" value="Default" />
+       <option name="presentableId" value="Default" />
+       <updated>1647350746642</updated>
+       <workItem from="1647350750956" duration="4327000" />
+     </task>
+     <task id="LOCAL-00001" summary="Initial commit.">
+       <created>1647352693910</created>
+       <option name="number" value="00001" />
+       <option name="presentableId" value="LOCAL-00001" />
+       <option name="project" value="LOCAL" />
+       <updated>1647352693910</updated>
+     </task>
+     <task id="LOCAL-00002" summary="Initial commit.">
+       <created>1647353059401</created>
+       <option name="number" value="00002" />
+       <option name="presentableId" value="LOCAL-00002" />
+       <option name="project" value="LOCAL" />
+       <updated>1647353059401</updated>
+     </task>
+     <task id="LOCAL-00003" summary="Added gitignore.">
+       <created>1647353514970</created>
+       <option name="number" value="00003" />
+       <option name="presentableId" value="LOCAL-00003" />
+       <option name="project" value="LOCAL" />
+       <updated>1647353514970</updated>
+     </task>
+     <task id="LOCAL-00004" summary="Added gitignore.">
+       <created>1647353622389</created>
+       <option name="number" value="00004" />
+       <option name="presentableId" value="LOCAL-00004" />
+       <option name="project" value="LOCAL" />
+       <updated>1647353622389</updated>
+     </task>
+     <task id="LOCAL-00005" summary="Added gitignore.">
+       <created>1647353674966</created>
+       <option name="number" value="00005" />
+       <option name="presentableId" value="LOCAL-00005" />
+       <option name="project" value="LOCAL" />
+       <updated>1647353674966</updated>
+     </task>
+     <task id="LOCAL-00006" summary="Initial commit.">
+       <created>1647354226094</created>
+       <option name="number" value="00006" />
+       <option name="presentableId" value="LOCAL-00006" />
+       <option name="project" value="LOCAL" />
+       <updated>1647354226094</updated>
+     </task>
+     <option name="localTasksCounter" value="7" />
+     <servers />
+   </component>
+   <component name="TypeScriptGeneratedFilesManager">
+     <option name="version" value="3" />
+   </component>
+   <component name="Vcs.Log.Tabs.Properties">
+     <option name="TAB_STATES">
+       <map>
+         <entry key="MAIN">
+           <value>
+             <State />
+           </value>
+         </entry>
+       </map>
+     </option>
+   </component>
+   <component name="VcsManagerConfiguration">
+     <MESSAGE value="Added gitignore." />
+     <MESSAGE value="Initial commit." />
+     <option name="LAST_COMMIT_MESSAGE" value="Initial commit." />
+   </component>
+ </project>
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: DaS
- emoji: 💻
- colorFrom: pink
- colorTo: pink
- sdk: gradio
- sdk_version: 2.8.10
- app_file: app.py
- pinned: false
- ---
-
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces#reference
app.py ADDED
@@ -0,0 +1,151 @@
+ import gradio as gr
+ import numpy as np
+ import requests
+ import torch
+ import yaml
+ from PIL import Image
+ from torchvision import transforms
+
+ from segmenter_model import utils
+ from segmenter_model.factory import create_segmenter
+ from segmenter_model.fpn_picie import PanopticFPN
+ from segmenter_model.utils import colorize_one, map2cs
+
+ WEIGHTS = './weights/segmenter.pth'
+
+
+ def download_file_from_google_drive(id, destination):
+     def get_confirm_token(response):
+         for key, value in response.cookies.items():
+             if key.startswith('download_warning'):
+                 return value
+
+         return None
+
+     def save_response_content(response, destination):
+         CHUNK_SIZE = 32768
+
+         with open(destination, "wb") as f:
+             for chunk in response.iter_content(CHUNK_SIZE):
+                 if chunk:  # filter out keep-alive new chunks
+                     f.write(chunk)
+
+     URL = "https://docs.google.com/uc?export=download"
+
+     session = requests.Session()
+
+     response = session.get(URL, params={'id': id}, stream=True)
+     token = get_confirm_token(response)
+
+     if token:
+         params = {'id': id, 'confirm': token}
+         response = session.get(URL, params=params, stream=True)
+
+     save_response_content(response, destination)
+
+
+ def segment_segmenter(image, model, window_size, window_stride, encoder_features=False, decoder_features=False,
+                       no_upsample=False, batch_size=2):
+     seg_pred = utils.inference(
+         model,
+         image,
+         image.shape[-2:],
+         window_size,
+         window_stride,
+         batch_size=batch_size,
+         no_upsample=no_upsample,
+         encoder_features=encoder_features,
+         decoder_features=decoder_features
+     )
+     if not (encoder_features or decoder_features):
+         seg_pred = seg_pred.argmax(1).unsqueeze(1)
+     return seg_pred
+
+
+ def remap(seg_pred, ignore=255):
+     mapping = {0: 0, 12: 1, 15: 2, 23: 3, 10: 4, 14: 5, 18: 6, 2: 7, 17: 8, 13: 9, 8: 10, 3: 11, 27: 12, 4: 13, 25: 14,
+                24: 15, 6: 16, 22: 17, 28: 18}
+     h, w = seg_pred.shape[-2:]
+     seg_pred_remap = np.ones((h, w), dtype=np.uint8) * ignore
+     for pseudo, gt in mapping.items():
+         whr = seg_pred == pseudo
+         seg_pred_remap[whr] = gt
+     return seg_pred_remap
+
+
+ def create_model(resnet=False):
+     weights_path = WEIGHTS
+     variant_path = '{}_variant.yml'.format(weights_path)
+
+     print('Use weights {}'.format(weights_path))
+     print('Load variant from {}'.format(variant_path))
+     variant = yaml.load(
+         open(variant_path, "r"), Loader=yaml.FullLoader
+     )
+
+     # TODO: parse hyperparameters
+     window_size = variant['inference_kwargs']["window_size"]
+     window_stride = variant['inference_kwargs']["window_stride"]
+     dataset_kwargs = variant['dataset_kwargs']
+     net_kwargs = variant["net_kwargs"]
+     net_kwargs['n_cls'] = dataset_kwargs['nlabels']
+
+     dataset_kwargs = variant['dataset_kwargs']
+
+     net_kwargs = variant["net_kwargs"]
+     net_kwargs['n_cls'] = dataset_kwargs['nlabels']
+     if not resnet:
+         net_kwargs['decoder']['dropout'] = 0.
+
+     # TODO: create model
+     if resnet:
+         model = PanopticFPN(arch=net_kwargs['backbone'], pretrain=net_kwargs['pretrain'], n_cls=net_kwargs['n_cls'])
+     else:
+         model = create_segmenter(net_kwargs)
+
+     # TODO: load weights
+     print('Load weights from {}'.format(weights_path))
+     weights = torch.load(weights_path)['model']
+     model.load_state_dict(weights, strict=True)
+
+     model.eval()
+
+     return model, window_size, window_stride
+
+
+ def get_transformations():
+     return transforms.Compose([
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+
+
+ model, window_size, window_stride = create_model()
+
+
+ def predict(input_img):
+     input_img = Image.open(input_img)
+     transform = transforms.Compose([transforms.Resize(256, Image.BICUBIC), transforms.ToTensor()])
+     input_img = transform(input_img)
+     input_img = torch.unsqueeze(input_img, 0)
+
+     with torch.no_grad():
+         segmentation = segment_segmenter(input_img, model, window_size, window_stride).squeeze().detach()
+         segmentation_remap = remap(segmentation)
+
+     drawing_pseudo = colorize_one(segmentation_remap)
+     drawing_cs = map2cs(segmentation_remap)
+
+     drawing_pseudo = transforms.ToPILImage()(drawing_pseudo)
+     drawing_cs = transforms.ToPILImage()(drawing_cs)
+     return drawing_pseudo, drawing_cs
+
+
+ title = "Drive&Segment"
+ description = 'Gradio Demo accompanying paper "Drive&Segment: Unsupervised Semantic Segmentation of Urban Scenes via Cross-modal Distillation"'
+ # article = "<p style='text-align: center'><a href='TODO' target='_blank'>Project Page</a> | <a href='codelink' target='_blank'>Github</a></p>"
+ examples = [['examples/img1.jpg']]
+
+ iface = gr.Interface(predict, gr.inputs.Image(type='filepath'), "image", title=title, description=description,
+                      examples=examples)
+
+ iface.launch()
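
For reference, a minimal, self-contained sketch of what the remap() step above does; the mapping here is abbreviated to a few entries for illustration, not the full 19-class table used in app.py:

import numpy as np

# Pseudo-label ids predicted by the model are translated to Cityscapes train
# ids; anything without a mapping stays at the ignore value (255).
mapping = {0: 0, 12: 1, 15: 2, 23: 3}      # abbreviated example mapping
pred = np.array([[0, 12], [99, 23]])       # toy 2x2 prediction
remapped = np.full(pred.shape, 255, dtype=np.uint8)
for pseudo_id, train_id in mapping.items():
    remapped[pred == pseudo_id] = train_id
print(remapped)  # [[0, 1], [255, 3]]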
examples/img1.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ torchvision
+ Pillow
+ timm
+ PyYAML
+ einops
segmenter_model/backbone_picie.py ADDED
@@ -0,0 +1,348 @@
+ import torch.nn as nn
+
+ try:
+     from torchvision.models.utils import load_state_dict_from_url
+ except:
+     from torch.utils.model_zoo import load_url as load_state_dict_from_url
+
+ __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+            'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
+            'wide_resnet50_2', 'wide_resnet101_2']
+
+ model_urls = {
+     'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+     'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+     'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+     'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+     'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+     'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
+     'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
+     'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
+     'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
+ }
+
+
+ def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+     """3x3 convolution with padding"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                      padding=dilation, groups=groups, bias=False, dilation=dilation)
+
+
+ def conv1x1(in_planes, out_planes, stride=1):
+     """1x1 convolution"""
+     return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
+
+
+ class BasicBlock(nn.Module):
+     expansion = 1
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                  base_width=64, dilation=1, norm_layer=None):
+         super(BasicBlock, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         if groups != 1 or base_width != 64:
+             raise ValueError('BasicBlock only supports groups=1 and base_width=64')
+         if dilation > 1:
+             raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
+         # Both self.conv1 and self.downsample layers downsample the input when stride != 1
+         self.conv1 = conv3x3(inplanes, planes, stride)
+         self.bn1 = norm_layer(planes)
+         self.relu = nn.ReLU(inplace=True)
+         self.conv2 = conv3x3(planes, planes)
+         self.bn2 = norm_layer(planes)
+         self.downsample = downsample
+         self.stride = stride
+
+     def forward(self, x):
+         identity = x
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out += identity
+         out = self.relu(out)
+
+         return out
+
+
+ class Bottleneck(nn.Module):
+     # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
+     # while original implementation places the stride at the first 1x1 convolution(self.conv1)
+     # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
+     # This variant is also known as ResNet V1.5 and improves accuracy according to
+     # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
+
+     expansion = 4
+
+     def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                  base_width=64, dilation=1, norm_layer=None):
+         super(Bottleneck, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         width = int(planes * (base_width / 64.)) * groups
+         # Both self.conv2 and self.downsample layers downsample the input when stride != 1
+         self.conv1 = conv1x1(inplanes, width)
+         self.bn1 = norm_layer(width)
+         self.conv2 = conv3x3(width, width, stride, groups, dilation)
+         self.bn2 = norm_layer(width)
+         self.conv3 = conv1x1(width, planes * self.expansion)
+         self.bn3 = norm_layer(planes * self.expansion)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = downsample
+         self.stride = stride
+
+     def forward(self, x):
+         identity = x
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+         out = self.relu(out)
+
+         out = self.conv3(out)
+         out = self.bn3(out)
+
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out += identity
+         out = self.relu(out)
+
+         return out
+
+
+ class ResNet(nn.Module):
+
+     def __init__(self, block, layers, zero_init_residual=False,
+                  groups=1, width_per_group=64, replace_stride_with_dilation=None,
+                  norm_layer=None):
+         super(ResNet, self).__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         self._norm_layer = norm_layer
+
+         self.inplanes = 64
+         self.dilation = 1
+         if replace_stride_with_dilation is None:
+             # each element in the tuple indicates if we should replace
+             # the 2x2 stride with a dilated convolution instead
+             replace_stride_with_dilation = [False, False, False]
+         if len(replace_stride_with_dilation) != 3:
+             raise ValueError("replace_stride_with_dilation should be None "
+                              "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
+         self.groups = groups
+         self.base_width = width_per_group
+         self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
+                                bias=False)
+         self.bn1 = norm_layer(self.inplanes)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+         self.layer1 = self._make_layer(block, 64, layers[0])
+         self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
+                                        dilate=replace_stride_with_dilation[0])
+         self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
+                                        dilate=replace_stride_with_dilation[1])
+         self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
+                                        dilate=replace_stride_with_dilation[2])
+         # self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+         # self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+             elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
+                 nn.init.constant_(m.weight, 1)
+                 nn.init.constant_(m.bias, 0)
+
+         # Zero-initialize the last BN in each residual branch,
+         # so that the residual branch starts with zeros, and each residual block behaves like an identity.
+         # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
+         if zero_init_residual:
+             for m in self.modules():
+                 if isinstance(m, Bottleneck):
+                     nn.init.constant_(m.bn3.weight, 0)
+                 elif isinstance(m, BasicBlock):
+                     nn.init.constant_(m.bn2.weight, 0)
+
+     def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
+         norm_layer = self._norm_layer
+         downsample = None
+         previous_dilation = self.dilation
+         if dilate:
+             self.dilation *= stride
+             stride = 1
+         if stride != 1 or self.inplanes != planes * block.expansion:
+             downsample = nn.Sequential(
+                 conv1x1(self.inplanes, planes * block.expansion, stride),
+                 norm_layer(planes * block.expansion),
+             )
+
+         layers = []
+         layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
+                             self.base_width, previous_dilation, norm_layer))
+         self.inplanes = planes * block.expansion
+         for _ in range(1, blocks):
+             layers.append(block(self.inplanes, planes, groups=self.groups,
+                                 base_width=self.base_width, dilation=self.dilation,
+                                 norm_layer=norm_layer))
+
+         return nn.Sequential(*layers)
+
+     def _forward_impl(self, x):
+         outputs = {}
+         # See note [TorchScript super()]
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+         # outputs['stem'] = x
+
+         x = self.layer1(x)  # 1/4
+         outputs['res2'] = x
+
+         x = self.layer2(x)  # 1/8
+         outputs['res3'] = x
+
+         x = self.layer3(x)  # 1/16
+         outputs['res4'] = x
+
+         x = self.layer4(x)  # 1/32
+         outputs['res5'] = x
+
+         return outputs
+
+     def forward(self, x):
+         return self._forward_impl(x)
+
+
+ def _resnet(arch, block, layers, pretrained, progress, **kwargs):
+     model = ResNet(block, layers, **kwargs)
+     if pretrained:
+         state_dict = load_state_dict_from_url(model_urls[arch],
+                                               progress=progress)
+         model.load_state_dict(state_dict, strict=False)
+     return model
+
+
+ def resnet18(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-18 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
+                    **kwargs)
+
+
+ def resnet34(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-34 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress,
+                    **kwargs)
+
+
+ def resnet50(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-50 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress,
+                    **kwargs)
+
+
+ def resnet101(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-101 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress,
+                    **kwargs)
+
+
+ def resnet152(pretrained=False, progress=True, **kwargs):
+     r"""ResNet-152 model from
+     `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress,
+                    **kwargs)
+
+
+ def resnext50_32x4d(pretrained=False, progress=True, **kwargs):
+     r"""ResNeXt-50 32x4d model from
+     `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['groups'] = 32
+     kwargs['width_per_group'] = 4
+     return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3],
+                    pretrained, progress, **kwargs)
+
+
+ def resnext101_32x8d(pretrained=False, progress=True, **kwargs):
+     r"""ResNeXt-101 32x8d model from
+     `"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['groups'] = 32
+     kwargs['width_per_group'] = 8
+     return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3],
+                    pretrained, progress, **kwargs)
+
+
+ def wide_resnet50_2(pretrained=False, progress=True, **kwargs):
+     r"""Wide ResNet-50-2 model from
+     `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+     The model is the same as ResNet except for the bottleneck number of channels
+     which is twice larger in every block. The number of channels in outer 1x1
+     convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+     channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['width_per_group'] = 64 * 2
+     return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3],
+                    pretrained, progress, **kwargs)
+
+
+ def wide_resnet101_2(pretrained=False, progress=True, **kwargs):
+     r"""Wide ResNet-101-2 model from
+     `"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_
+     The model is the same as ResNet except for the bottleneck number of channels
+     which is twice larger in every block. The number of channels in outer 1x1
+     convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
+     channels, and in Wide ResNet-50-2 has 2048-1024-2048.
+     Args:
+         pretrained (bool): If True, returns a model pre-trained on ImageNet
+         progress (bool): If True, displays a progress bar of the download to stderr
+     """
+     kwargs['width_per_group'] = 64 * 2
+     return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3],
+                    pretrained, progress, **kwargs)
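
A quick shape check for the backbone above (a sketch, assuming the package is importable as segmenter_model): the classifier head is removed and the forward pass returns a dict of multi-scale features.

import torch
from segmenter_model import backbone_picie as backbone

net = backbone.resnet18(pretrained=False)  # pretrained=True would fetch ImageNet weights
with torch.no_grad():
    feats = net(torch.randn(1, 3, 224, 224))
for name, f in feats.items():
    print(name, tuple(f.shape))
# res2 (1, 64, 56, 56), res3 (1, 128, 28, 28), res4 (1, 256, 14, 14), res5 (1, 512, 7, 7)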
segmenter_model/blocks.py ADDED
@@ -0,0 +1,129 @@
+ """
+ Adapted from 2020 Ross Wightman
+ https://github.com/rwightman/pytorch-image-models
+ """
+
+ import torch
+ import torch.nn as nn
+ from einops import rearrange
+ from pathlib import Path
+
+ import torch.nn.functional as F
+
+ from timm.models.layers import DropPath
+
+
+ class FeedForward(nn.Module):
+     def __init__(self, dim, hidden_dim, dropout, out_dim=None):
+         super().__init__()
+         self.fc1 = nn.Linear(dim, hidden_dim)
+         self.act = nn.GELU()
+         if out_dim is None:
+             out_dim = dim
+         self.fc2 = nn.Linear(hidden_dim, out_dim)
+         self.drop = nn.Dropout(dropout)
+
+     @property
+     def unwrapped(self):
+         return self
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = self.act(x)
+         x = self.drop(x)
+         x = self.fc2(x)
+         x = self.drop(x)
+         return x
+
+
+ class Attention(nn.Module):
+     def __init__(self, dim, heads, dropout):
+         super().__init__()
+         self.heads = heads
+         head_dim = dim // heads
+         self.scale = head_dim ** -0.5
+         self.attn = None
+
+         self.qkv = nn.Linear(dim, dim * 3)
+         self.attn_drop = nn.Dropout(dropout)
+         self.proj = nn.Linear(dim, dim)
+         self.proj_drop = nn.Dropout(dropout)
+
+     @property
+     def unwrapped(self):
+         return self
+
+     def forward(self, x, mask=None):
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.heads, C // self.heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = (
+             qkv[0],
+             qkv[1],
+             qkv[2],
+         )
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+
+         return x, attn
+
+
+ class AttentionQK(nn.Module):
+     def __init__(self, dim, heads=1, dropout=0.):
+         super().__init__()
+         self.heads = heads
+         head_dim = dim // heads
+         self.scale = head_dim ** -0.5
+         self.attn = None
+
+         self.qk = nn.Linear(dim, dim * 2)
+         self.attn_drop = nn.Dropout(dropout)
+
+     @property
+     def unwrapped(self):
+         return self
+
+     def forward(self, x):
+         B, N, C = x.shape
+         qkv = (
+             self.qk(x)
+             .reshape(B, N, 2, self.heads, C // self.heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k = (
+             qkv[0],
+             qkv[1]
+         )
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         # attn = attn.sigmoid()
+         attn = attn.softmax(dim=-1)
+
+         return attn
+
+
+ class Block(nn.Module):
+     def __init__(self, dim, heads, mlp_dim, dropout, drop_path):
+         super().__init__()
+         self.norm1 = nn.LayerNorm(dim)
+         self.norm2 = nn.LayerNorm(dim)
+         self.attn = Attention(dim, heads, dropout)
+         self.mlp = FeedForward(dim, mlp_dim, dropout)
+         self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+
+     def forward(self, x, mask=None, return_attention=False):
+         y, attn = self.attn(self.norm1(x), mask)
+         if return_attention:
+             return attn
+         x = x + self.drop_path(y)
+         x = x + self.drop_path(self.mlp(self.norm2(x)))
+         return x
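
A shape sketch of the pre-norm transformer Block defined above; the hyperparameters here are arbitrary, sized like a ViT-S token stream.

import torch
from segmenter_model.blocks import Block

blk = Block(dim=384, heads=6, mlp_dim=4 * 384, dropout=0.0, drop_path=0.0)
x = torch.randn(2, 197, 384)                 # batch x tokens x dim
print(blk(x).shape)                          # torch.Size([2, 197, 384])
print(blk(x, return_attention=True).shape)   # torch.Size([2, 6, 197, 197])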
segmenter_model/decoder.py ADDED
@@ -0,0 +1,214 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import rearrange
+
+ from timm.models.layers import trunc_normal_
+
+ from segmenter_model.blocks import Block, FeedForward
+ from segmenter_model.utils import init_weights
+
+
+ class DecoderLinear(nn.Module):
+     def __init__(self, n_cls, patch_size, d_encoder):
+         super().__init__()
+
+         self.d_encoder = d_encoder
+         self.patch_size = patch_size
+         self.n_cls = n_cls
+
+         self.head = nn.Linear(self.d_encoder, n_cls)
+         self.apply(init_weights)
+
+     @torch.jit.ignore
+     def no_weight_decay(self):
+         return set()
+
+     def forward(self, x, im_size):
+         H, W = im_size
+         GS = H // self.patch_size
+         x = self.head(x)
+         x = rearrange(x, "b (h w) c -> b c h w", h=GS)
+
+         return x
+
+
+ class MaskTransformer(nn.Module):
+     def __init__(
+             self,
+             n_cls,
+             patch_size,
+             d_encoder,
+             n_layers,
+             n_heads,
+             d_model,
+             d_ff,
+             drop_path_rate,
+             dropout,
+     ):
+         super().__init__()
+         self.d_encoder = d_encoder
+         self.patch_size = patch_size
+         self.n_layers = n_layers
+         self.n_cls = n_cls
+         self.d_model = d_model
+         self.d_ff = d_ff
+         self.scale = d_model ** -0.5
+
+         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, n_layers)]
+         self.blocks = nn.ModuleList(
+             [Block(d_model, n_heads, d_ff, dropout, dpr[i]) for i in range(n_layers)]
+         )
+
+         self.cls_emb = nn.Parameter(torch.randn(1, n_cls, d_model))
+         self.proj_dec = nn.Linear(d_encoder, d_model)
+
+         self.proj_patch = nn.Parameter(self.scale * torch.randn(d_model, d_model))
+         self.proj_classes = nn.Parameter(self.scale * torch.randn(d_model, d_model))
+
+         self.decoder_norm = nn.LayerNorm(d_model)
+         self.mask_norm = nn.LayerNorm(n_cls)
+
+         self.apply(init_weights)
+         trunc_normal_(self.cls_emb, std=0.02)
+
+     @torch.jit.ignore
+     def no_weight_decay(self):
+         return {"cls_emb"}
+
+     def forward(self, x, im_size, features_only=False, no_rearrange=False):
+         H, W = im_size
+         GS = H // self.patch_size
+
+         # project from the encoder dimensionality to the decoder dimensionality (usually the same)
+         x = self.proj_dec(x)
+         # reshape the class embedding token
+         cls_emb = self.cls_emb.expand(x.size(0), -1, -1)
+         # concatenate the class embedding token to the input
+         x = torch.cat((x, cls_emb), 1)
+         # forward the concatenated tokens through decoder blocks
+         for blk in self.blocks:
+             x = blk(x)
+         # perform normalization
+         x = self.decoder_norm(x)
+
+         # split to patch features and class-segmentation features
+         patches, cls_seg_feat = x[:, : -self.n_cls], x[:, -self.n_cls:]
+
+         # project the patch features
+         patches = patches @ self.proj_patch
+
+         if features_only:
+             if not no_rearrange:
+                 features = rearrange(patches, "b (h w) n -> b n h w", h=int(GS))
+             else:
+                 features = patches
+             return features
+
+         # project the class-segmentation features
+         cls_seg_feat = cls_seg_feat @ self.proj_classes
+
+         # scalar product between L2-normalized patch embeddings and class embeddings -> masks
+         patches = patches / patches.norm(dim=-1, keepdim=True)
+         cls_seg_feat = cls_seg_feat / cls_seg_feat.norm(dim=-1, keepdim=True)
+         masks = patches @ cls_seg_feat.transpose(1, 2)
+
+         masks = self.mask_norm(masks)
+         if not no_rearrange:
+             masks = rearrange(masks, "b (h w) n -> b n h w", h=int(GS))
+
+         return masks
+
+     def get_attention_map(self, x, layer_id):
+         if layer_id >= self.n_layers or layer_id < 0:
+             raise ValueError(
+                 f"Provided layer_id: {layer_id} is not valid. 0 <= {layer_id} < {self.n_layers}."
+             )
+         x = self.proj_dec(x)
+         cls_emb = self.cls_emb.expand(x.size(0), -1, -1)
+         x = torch.cat((x, cls_emb), 1)
+         for i, blk in enumerate(self.blocks):
+             if i < layer_id:
+                 x = blk(x)
+             else:
+                 return blk(x, return_attention=True)
+
+
+ class DeepLabHead(nn.Sequential):
+     def __init__(self, in_channels, num_classes, patch_size=None):
+         super(DeepLabHead, self).__init__(
+             ASPP(in_channels, [12, 24, 36]),
+             nn.Conv2d(256, 256, 3, padding=1, bias=False),
+             nn.BatchNorm2d(256),
+             nn.ReLU(),
+             nn.Conv2d(256, num_classes, 1)
+         )
+         self.patch_size = patch_size
+
+     def forward(self, x, im_size=None):
+         if len(x.shape) == 3:
+             # features from ViT
+             assert im_size is not None and self.patch_size is not None
+             H, W = im_size
+             GS = H // self.patch_size
+             x = rearrange(x, "b (h w) n -> b n h w", h=int(GS)).contiguous()
+         for module in self:
+             x = module(x)
+         return x
+
+
+ class ASPPConv(nn.Sequential):
+     def __init__(self, in_channels, out_channels, dilation):
+         modules = [
+             nn.Conv2d(in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU()
+         ]
+         super(ASPPConv, self).__init__(*modules)
+
+
+ class ASPPPooling(nn.Sequential):
+     def __init__(self, in_channels, out_channels):
+         super(ASPPPooling, self).__init__(
+             nn.AdaptiveAvgPool2d(1),
+             nn.Conv2d(in_channels, out_channels, 1, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU())
+
+     def forward(self, x):
+         size = x.shape[-2:]
+         for mod in self:
+             x = mod(x)
+         return F.interpolate(x, size=size, mode='bilinear', align_corners=False)
+
+
+ class ASPP(nn.Module):
+     def __init__(self, in_channels, atrous_rates, out_channels=256):
+         super(ASPP, self).__init__()
+         modules = []
+         modules.append(nn.Sequential(
+             nn.Conv2d(in_channels, out_channels, 1, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU()))
+
+         rates = tuple(atrous_rates)
+         for rate in rates:
+             modules.append(ASPPConv(in_channels, out_channels, rate))
+
+         modules.append(ASPPPooling(in_channels, out_channels))
+
+         self.convs = nn.ModuleList(modules)
+
+         self.project = nn.Sequential(
+             nn.Conv2d(5 * out_channels, out_channels, 1, bias=False),
+             nn.BatchNorm2d(out_channels),
+             nn.ReLU(),
+             nn.Dropout(0.5))
+
+     def forward(self, x):
+         res = []
+         for conv in self.convs:
+             res.append(conv(x))
+         res = torch.cat(res, dim=1)
+         return self.project(res)
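
A minimal sketch of the MaskTransformer decoder above with hypothetical hyperparameters: N patch tokens go in, and per-class masks at 1/patch_size of the image resolution come out.

import torch
from segmenter_model.decoder import MaskTransformer

dec = MaskTransformer(n_cls=19, patch_size=16, d_encoder=384, n_layers=2,
                      n_heads=6, d_model=384, d_ff=4 * 384,
                      drop_path_rate=0.0, dropout=0.0)
tokens = torch.randn(1, (224 // 16) ** 2, 384)   # B x N x d_encoder
print(dec(tokens, im_size=(224, 224)).shape)     # torch.Size([1, 19, 14, 14])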
segmenter_model/factory.py ADDED
@@ -0,0 +1,165 @@
+ from pathlib import Path
+ import yaml
+ import torch
+ import math
+ import os
+ import torch.nn as nn
+
+ from timm.models.helpers import load_pretrained, load_custom_pretrained
+ from timm.models.vision_transformer import default_cfgs, checkpoint_filter_fn
+ from timm.models.registry import register_model
+ from timm.models.vision_transformer import _create_vision_transformer
+ from segmenter_model.decoder import DecoderLinear, DeepLabHead, MaskTransformer
+ from segmenter_model.segmenter import Segmenter
+ import segmenter_model.torch as ptu
+
+ from segmenter_model.vit_dino import vit_small, VisionTransformer
+
+
+ @register_model
+ def vit_base_patch8_384(pretrained=False, **kwargs):
+     """ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
+     ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
+     """
+     model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs)
+     model = _create_vision_transformer(
+         "vit_base_patch8_384",
+         pretrained=pretrained,
+         default_cfg=dict(
+             url="",
+             input_size=(3, 384, 384),
+             mean=(0.5, 0.5, 0.5),
+             std=(0.5, 0.5, 0.5),
+             num_classes=1000,
+         ),
+         **model_kwargs,
+     )
+     return model
+
+
+ def create_vit(model_cfg):
+     model_cfg = model_cfg.copy()
+     backbone = model_cfg.pop("backbone")
+     if 'pretrained_weights' in model_cfg:
+         pretrained_weights = model_cfg.pop('pretrained_weights')
+
+     if 'dino' in backbone:
+         if backbone.lower() == 'dino_vits16':
+             model_cfg['drop_rate'] = model_cfg['dropout']
+             model = vit_small(**model_cfg)
+             # hard-coded for now, too lazy
+             ciirc_path = '/home/vobecant/PhD/weights/dino/dino_deitsmall16_pretrain.pth'
+             karolina_path = '/scratch/project/dd-21-20/pretrained_weights/dino/dino_deitsmall16_pretrain.pth'
+             if os.path.exists(ciirc_path):
+                 pretrained_weights = ciirc_path
+             elif os.path.exists(karolina_path):
+                 pretrained_weights = karolina_path
+             else:
+                 raise Exception('DINO weights not found!')
+             model.load_state_dict(torch.load(pretrained_weights), strict=True)
+         else:
+             model = torch.hub.load('facebookresearch/dino:main', backbone)
+         setattr(model, 'd_model', model.num_features)
+         setattr(model, 'patch_size', model.patch_embed.patch_size)
+         setattr(model, 'distilled', False)
+         model.forward = lambda x, return_features: model.get_intermediate_layers(x, n=1)[0]
+     else:
+         normalization = model_cfg.pop("normalization")
+         model_cfg["n_cls"] = 1000
+         mlp_expansion_ratio = 4
+         model_cfg["d_ff"] = mlp_expansion_ratio * model_cfg["d_model"]
+
+         if backbone in default_cfgs:
+             default_cfg = default_cfgs[backbone]
+         else:
+             default_cfg = dict(
+                 pretrained=False,
+                 num_classes=1000,
+                 drop_rate=0.0,
+                 drop_path_rate=0.0,
+                 drop_block_rate=None,
+             )
+
+         default_cfg["input_size"] = (
+             3,
+             model_cfg["image_size"][0],
+             model_cfg["image_size"][1],
+         )
+         model = VisionTransformer(**model_cfg)
+         if backbone == "vit_base_patch8_384":
+             path = os.path.expandvars("/home/vobecant/PhD/weights/vit_base_patch8_384.pth")
+             state_dict = torch.load(path, map_location="cpu")
+             filtered_dict = checkpoint_filter_fn(state_dict, model)
+             model.load_state_dict(filtered_dict, strict=True)
+         elif "deit" in backbone:
+             load_pretrained(model, default_cfg, filter_fn=checkpoint_filter_fn)
+         else:
+             load_custom_pretrained(model, default_cfg)
+
+     return model
+
+
+ def create_decoder(encoder, decoder_cfg):
+     decoder_cfg = decoder_cfg.copy()
+     name = decoder_cfg.pop("name")
+     decoder_cfg["d_encoder"] = encoder.d_model
+     decoder_cfg["patch_size"] = encoder.patch_size
+
+     if "linear" in name:
+         decoder = DecoderLinear(**decoder_cfg)
+     elif name == "mask_transformer":
+         dim = encoder.d_model
+         n_heads = dim // 64
+         decoder_cfg["n_heads"] = n_heads
+         decoder_cfg["d_model"] = dim
+         decoder_cfg["d_ff"] = 4 * dim
+         decoder = MaskTransformer(**decoder_cfg)
+     elif 'deeplab' in name:
+         decoder = DeepLabHead(in_channels=encoder.d_model, num_classes=decoder_cfg["n_cls"],
+                               patch_size=decoder_cfg["patch_size"])
+     else:
+         raise ValueError(f"Unknown decoder: {name}")
+     return decoder
+
+
+ def create_segmenter(model_cfg):
+     model_cfg = model_cfg.copy()
+     decoder_cfg = model_cfg.pop("decoder")
+     decoder_cfg["n_cls"] = model_cfg["n_cls"]
+
+     if 'weights_path' in model_cfg.keys():
+         weights_path = model_cfg.pop('weights_path')
+     else:
+         weights_path = None
+
+     encoder = create_vit(model_cfg)
+     decoder = create_decoder(encoder, decoder_cfg)
+     model = Segmenter(encoder, decoder, n_cls=model_cfg["n_cls"])
+
+     if weights_path is not None:
+         raise Exception('Tried to load weights into the complete segmenter inside the create_segmenter method!')
+         state_dict = torch.load(weights_path, map_location="cpu")
+         if 'model' in state_dict:
+             state_dict = state_dict['model']
+         msg = model.load_state_dict(state_dict, strict=False)
+         print(msg)
+
+     return model
+
+
+ def load_model(model_path, decoder_only=False, variant_path=None):
+     variant_path = Path(model_path).parent / "variant.yml" if variant_path is None else variant_path
+     with open(variant_path, "r") as f:
+         variant = yaml.load(f, Loader=yaml.FullLoader)
+     net_kwargs = variant["net_kwargs"]
+
+     model = create_segmenter(net_kwargs)
+     data = torch.load(model_path, map_location=ptu.device)
+     checkpoint = data["model"]
+
+     if decoder_only:
+         model.decoder.load_state_dict(checkpoint, strict=True)
+     else:
+         model.load_state_dict(checkpoint, strict=True)
+
+     return model, variant
segmenter_model/fpn_picie.py ADDED
@@ -0,0 +1,66 @@
+ # taken from https://raw.githubusercontent.com/janghyuncho/PiCIE/1d7b034f57e98670b0d6a244b2eea11fa0dde73e/modules/fpn.py
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from . import backbone_picie as backbone
+
+
+ class PanopticFPN(nn.Module):
+     def __init__(self, arch, pretrain, n_cls):
+         super(PanopticFPN, self).__init__()
+         self.n_cls = n_cls
+         self.backbone = backbone.__dict__[arch](pretrained=pretrain)
+         self.decoder = FPNDecoder(arch, n_cls)
+
+     def forward(self, x, encoder_features=False, decoder_features=False):
+         feats = self.backbone(x)
+         if decoder_features:
+             dec, outs = self.decoder(feats, get_features=decoder_features)
+         else:
+             outs = self.decoder(feats)
+
+         if encoder_features:
+             if decoder_features:
+                 return feats['res5'], dec, outs
+             else:
+                 return feats['res5'], outs
+         else:
+             return outs
+
+
+ class FPNDecoder(nn.Module):
+     def __init__(self, arch, n_cls):
+         super(FPNDecoder, self).__init__()
+         self.n_cls = n_cls
+         if arch == 'resnet18':
+             mfactor = 1
+             out_dim = 128
+         else:
+             mfactor = 4
+             out_dim = 256
+
+         self.layer4 = nn.Conv2d(512 * mfactor // 8, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer3 = nn.Conv2d(512 * mfactor // 4, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer2 = nn.Conv2d(512 * mfactor // 2, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer1 = nn.Conv2d(512 * mfactor, out_dim, kernel_size=1, stride=1, padding=0)
+
+         self.pred = nn.Conv2d(out_dim, self.n_cls, 1, 1)
+
+     def forward(self, x, get_features=False):
+         o1 = self.layer1(x['res5'])
+         o2 = self.upsample_add(o1, self.layer2(x['res4']))
+         o3 = self.upsample_add(o2, self.layer3(x['res3']))
+         o4 = self.upsample_add(o3, self.layer4(x['res2']))
+
+         pred = self.pred(o4)
+
+         if get_features:
+             return o4, pred
+         else:
+             return pred
+
+     def upsample_add(self, x, y):
+         _, _, H, W = y.size()
+
+         return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=False) + y
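
A shape sketch for the ResNet+FPN baseline above (pretrain=False avoids the torchvision weight download); predictions come out at 1/4 of the input resolution.

import torch
from segmenter_model.fpn_picie import PanopticFPN

net = PanopticFPN(arch='resnet18', pretrain=False, n_cls=19)
with torch.no_grad():
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 19, 56, 56])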
segmenter_model/picie_model.py ADDED
@@ -0,0 +1,82 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from . import backbone_picie as backbone
+
+
+ class PanopticFPN(nn.Module):
+     def __init__(self, args):
+         super(PanopticFPN, self).__init__()
+         self.backbone = backbone.__dict__[args.arch](pretrained=args.pretrain)
+         if args.arch == 'vit_small':
+             self.decoder = FPNDecoderViT(args)
+         else:
+             self.decoder = FPNDecoder(args)
+
+     def forward(self, x, encoder_features=False, decoder_features=False):
+         feats = self.backbone(x)
+         dec_outs = self.decoder(feats)
+
+         if encoder_features:
+             return feats['res5'], dec_outs
+         else:
+             return dec_outs
+
+
+ class FPNDecoder(nn.Module):
+     def __init__(self, args):
+         super(FPNDecoder, self).__init__()
+         if args.arch == 'resnet18':
+             mfactor = 1
+             out_dim = 128
+         else:
+             mfactor = 4
+             out_dim = 256
+
+         self.layer4 = nn.Conv2d(512 * mfactor // 8, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer3 = nn.Conv2d(512 * mfactor // 4, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer2 = nn.Conv2d(512 * mfactor // 2, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer1 = nn.Conv2d(512 * mfactor, out_dim, kernel_size=1, stride=1, padding=0)
+
+     def forward(self, x):
+         o1 = self.layer1(x['res5'])
+         o2 = self.upsample_add(o1, self.layer2(x['res4']))
+         o3 = self.upsample_add(o2, self.layer3(x['res3']))
+         o4 = self.upsample_add(o3, self.layer4(x['res2']))
+
+         return o4
+
+     def upsample_add(self, x, y):
+         _, _, H, W = y.size()
+
+         return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=False) + y
+
+
+ class FPNDecoderViT(nn.Module):
+     def __init__(self, args):
+         super(FPNDecoderViT, self).__init__()
+         if args.arch == 'resnet18' or args.arch == 'vit_small':
+             mfactor = 1
+             out_dim = 128
+         else:
+             mfactor = 4
+             out_dim = 256
+
+         self.upsample_rate = 4
+
+         self.layer4 = nn.Conv2d(384, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer3 = nn.Conv2d(384, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer2 = nn.Conv2d(384, out_dim, kernel_size=1, stride=1, padding=0)
+         self.layer1 = nn.Conv2d(384, out_dim, kernel_size=1, stride=1, padding=0)
+
+     def forward(self, x):
+         o1 = self.layer1(x[3])
+         o1 = F.interpolate(o1, scale_factor=4, mode='bilinear', align_corners=False)
+         o2 = self.upsample_add(o1, self.layer2(x[2]))
+         o3 = self.upsample_add(o2, self.layer3(x[1]))
+         o4 = self.upsample_add(o3, self.layer4(x[0]))
+
+         return o4
+
+     def upsample_add(self, x, y):
+         return F.interpolate(y, scale_factor=self.upsample_rate, mode='bilinear', align_corners=False) + x
segmenter_model/resnet_dilated.py ADDED
@@ -0,0 +1,55 @@
+ #
+ # Authors: Wouter Van Gansbeke & Simon Vandenhende
+ # Licensed under the CC BY-NC 4.0 license (https://creativecommons.org/licenses/by-nc/4.0/)
+
+ import torch.nn as nn
+
+ class ResnetDilated(nn.Module):
+     def __init__(self, orig_resnet, dilate_scale=8):
+         super(ResnetDilated, self).__init__()
+         from functools import partial
+
+         if dilate_scale == 8:
+             orig_resnet.layer3.apply(
+                 partial(self._nostride_dilate, dilate=2))
+             orig_resnet.layer4.apply(
+                 partial(self._nostride_dilate, dilate=4))
+         elif dilate_scale == 16:
+             orig_resnet.layer4.apply(
+                 partial(self._nostride_dilate, dilate=2))
+
+         self.conv1 = orig_resnet.conv1
+         self.bn1 = orig_resnet.bn1
+         self.relu = orig_resnet.relu
+
+         self.maxpool = orig_resnet.maxpool
+         self.layer1 = orig_resnet.layer1
+         self.layer2 = orig_resnet.layer2
+         self.layer3 = orig_resnet.layer3
+         self.layer4 = orig_resnet.layer4
+
+     def _nostride_dilate(self, m, dilate):
+         classname = m.__class__.__name__
+         if classname.find('Conv') != -1:
+             # the convolution with stride
+             if m.stride == (2, 2):
+                 m.stride = (1, 1)
+                 if m.kernel_size == (3, 3):
+                     m.dilation = (dilate//2, dilate//2)
+                     m.padding = (dilate//2, dilate//2)
+             # other convolutions
+             else:
+                 if m.kernel_size == (3, 3):
+                     m.dilation = (dilate, dilate)
+                     m.padding = (dilate, dilate)
+
+     def forward(self, x):
+         x = self.relu(self.bn1(self.conv1(x)))
+         x = self.maxpool(x)
+
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+
+         return x
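
A sketch of the dilation trick above: wrapping one of the backbones from backbone_picie.py keeps the output at 1/8 resolution instead of 1/32 (note that ResnetDilated calls the wrapped layers directly, so it returns a single tensor rather than the feature dict).

import torch
from segmenter_model import backbone_picie as backbone
from segmenter_model.resnet_dilated import ResnetDilated

net = ResnetDilated(backbone.resnet50(pretrained=False), dilate_scale=8)
with torch.no_grad():
    print(net(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 2048, 28, 28])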
segmenter_model/segmenter.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from einops import rearrange
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
+ # from timm.models.layers import trunc_normal_
8
+
9
+ from segmenter_model.utils import padding, unpadding
10
+
11
+
12
+ class Segmenter(nn.Module):
13
+ def __init__(
14
+ self,
15
+ encoder,
16
+ decoder,
17
+ n_cls,
18
+ ):
19
+ super().__init__()
20
+ self.n_cls = n_cls
21
+ self.patch_size = encoder.patch_size
22
+ self.encoder = encoder
23
+ self.decoder = decoder
24
+
25
+ @torch.jit.ignore
26
+ def no_weight_decay(self):
27
+ def append_prefix_no_weight_decay(prefix, module):
28
+ return set(map(lambda x: prefix + x, module.no_weight_decay()))
29
+
30
+ nwd_params = append_prefix_no_weight_decay("encoder.", self.encoder).union(
31
+ append_prefix_no_weight_decay("decoder.", self.decoder)
32
+ )
33
+ return nwd_params
34
+
35
+ def forward(self, im, decoder_features=False, no_upsample=False, encoder_features=False, no_rearrange=False,
36
+ cls_only=False, encoder_only=False):
37
+ H_ori, W_ori = im.size(2), im.size(3)
38
+ if not no_upsample:
39
+ im = padding(im, self.patch_size)
40
+ H, W = im.size(2), im.size(3)
41
+
42
+ x = self.encoder(im, return_features=True) # self.patch_size times smaller than im
43
+
44
+ # remove CLS/DIST tokens for decoding
45
+ num_extra_tokens = 1 + self.encoder.distilled
46
+
47
+ if cls_only:
48
+ return x[:, 0]
49
+ x = x[:, num_extra_tokens:]
50
+
51
+ if encoder_features:
52
+ enc_fts = x.clone()
53
+ if not no_rearrange:
54
+ GS = H // self.patch_size
55
+ enc_fts = rearrange(enc_fts, "b (h w) c -> b c h w", h=GS)
56
+ if encoder_only:
57
+ return enc_fts
58
+
59
+ if decoder_features:
60
+ output = self.decoder(x, (H, W), features_only=True, no_rearrange=no_rearrange)
61
+ if no_rearrange:
62
+ if encoder_features:
63
+ output = (enc_fts, output)
64
+ return output
65
+ else:
66
+ output = self.decoder(x, (H, W)) # shape (BS, NCLS, H/self.patch_size, W/self.patch_size)
67
+
68
+ if not no_upsample:
69
+ output = F.interpolate(output, size=(H, W), mode="bilinear") # upsample self.patch_size times
70
+ output = unpadding(output, (H_ori, W_ori))
71
+
72
+ if encoder_features:
73
+ output = (enc_fts, output)
74
+ return output
75
+
76
+ def get_attention_map_enc(self, im, layer_id):
77
+ return self.encoder.get_attention_map(im, layer_id)
78
+
79
+ def get_attention_map_dec(self, im, layer_id):
80
+ x = self.encoder(im, return_features=True)
81
+
82
+ # remove CLS/DIST tokens for decoding
83
+ num_extra_tokens = 1 + self.encoder.distilled
84
+ x = x[:, num_extra_tokens:]
85
+
86
+ return self.decoder.get_attention_map(x, layer_id)
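`Segmenter` itself is a thin wrapper: it pads the input to a multiple of the encoder's `patch_size`, strips the CLS/DIST tokens from the encoder output, decodes to per-class logits at patch resolution, then bilinearly upsamples and unpads back to the input size. A minimal sketch of the interface it expects, using toy encoder/decoder stand-ins rather than the repo's real ViT and decoder (illustrative only, assuming the `segmenter_model` package and its dependencies are importable):

import torch
import torch.nn as nn

from segmenter_model.segmenter import Segmenter


# Toy stand-ins exposing what Segmenter relies on:
# encoder(im, return_features=True) -> (B, 1 + distilled + N, D) tokens,
# decoder(tokens, (H, W), ...)      -> (B, n_cls, H // patch_size, W // patch_size).
class ToyEncoder(nn.Module):
    patch_size, distilled, d_model = 16, 0, 32

    def forward(self, im, return_features=True):
        B, _, H, W = im.shape
        n = (H // self.patch_size) * (W // self.patch_size)
        return torch.randn(B, 1 + n, self.d_model)


class ToyDecoder(nn.Module):
    def __init__(self, n_cls, d_model):
        super().__init__()
        self.n_cls = n_cls
        self.proj = nn.Linear(d_model, n_cls)

    def forward(self, x, im_size, **kwargs):
        H, W = im_size
        x = self.proj(x)  # (B, N, n_cls)
        return x.transpose(1, 2).reshape(x.size(0), self.n_cls, H // 16, W // 16)


model = Segmenter(ToyEncoder(), ToyDecoder(19, 32), n_cls=19).eval()
with torch.no_grad():
    out = model(torch.randn(1, 3, 500, 1000))  # padded internally to 512 x 1008
print(out.shape)  # torch.Size([1, 19, 500, 1000]): logits upsampled and cropped back to the input size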
segmenter_model/torch.py ADDED
@@ -0,0 +1,38 @@
1
+ import os
2
+ import torch
3
+
4
+ """
5
+ GPU wrappers
6
+ """
7
+
8
+ use_gpu = False
9
+ gpu_id = 0
10
+ device = None
11
+
12
+ distributed = False
13
+ dist_rank = 0
14
+ world_size = 1
15
+
16
+
17
+ def set_gpu_mode(mode, pbs=False):
18
+ global use_gpu
19
+ global device
20
+ global gpu_id
21
+ global distributed
22
+ global dist_rank
23
+ global world_size
24
+ if pbs:
25
+ gpu_id = int(os.environ.get("MPI_LOCALRANKID", 0))
26
+ dist_rank = int(os.environ.get("PMI_RANK", 0))
27
+ world_size = int(os.environ.get("PMI_SIZE", 1))
28
+ else:
29
+ gpu_id = int(os.environ.get("SLURM_LOCALID", 0))
30
+ dist_rank = int(os.environ.get("SLURM_PROCID", 0))
31
+ world_size = int(os.environ.get("SLURM_NTASKS", 1))
32
+
33
+ distributed = world_size > 1
34
+ use_gpu = mode
35
+ print('gpu_id: {}, dist_rank: {}, world_size: {}, distributed: {}'.format(gpu_id, dist_rank, world_size,
36
+ distributed))
37
+ device = torch.device(f"cuda:{gpu_id}" if use_gpu else "cpu")
38
+ torch.backends.cudnn.benchmark = True
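`segmenter_model/torch.py` keeps GPU and distributed state in module-level globals, mirroring the `ptu` helper of the original Segmenter code: call `set_gpu_mode` once at startup (it reads SLURM variables, or PBS/MPI ones with `pbs=True`), then read `device`, `dist_rank` and `world_size` from the module. A small usage sketch with made-up environment values (illustrative, assuming the package is importable):

import os

import torch

import segmenter_model.torch as ptu

# Pretend we are task 1 of a 4-task SLURM job (illustrative environment values).
os.environ.update({"SLURM_LOCALID": "1", "SLURM_PROCID": "1", "SLURM_NTASKS": "4"})

ptu.set_gpu_mode(torch.cuda.is_available())  # mode=False keeps everything on the CPU
print(ptu.device, ptu.dist_rank, ptu.world_size, ptu.distributed)
# e.g. "cuda:1 1 4 True" on a multi-GPU node, or "cpu 1 4 True" without CUDA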
segmenter_model/utils.py ADDED
@@ -0,0 +1,582 @@
1
+ import math
2
+ # import segm.utils.torch as ptu
3
+ # from segm.engine import seg2rgb
4
+ from collections import namedtuple
5
+
6
+ import cv2
7
+ import numpy as np
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from PIL import Image
11
+ from timm.models.layers import trunc_normal_
12
+
13
+ import torch
14
+
15
+ CityscapesClass = namedtuple('CityscapesClass', ['name', 'id', 'train_id', 'category', 'category_id',
16
+ 'has_instances', 'ignore_in_eval', 'color'])
17
+
18
+ classes = [
19
+ CityscapesClass('unlabeled', 0, 255, 'void', 0, False, True, (0, 0, 0)),
20
+ CityscapesClass('ego vehicle', 1, 255, 'void', 0, False, True, (0, 0, 0)),
21
+ CityscapesClass('rectification border', 2, 255, 'void', 0, False, True, (0, 0, 0)),
22
+ CityscapesClass('out of roi', 3, 255, 'void', 0, False, True, (0, 0, 0)),
23
+ CityscapesClass('static', 4, 255, 'void', 0, False, True, (0, 0, 0)),
24
+ CityscapesClass('dynamic', 5, 255, 'void', 0, False, True, (111, 74, 0)),
25
+ CityscapesClass('ground', 6, 255, 'void', 0, False, True, (81, 0, 81)),
26
+ CityscapesClass('road', 7, 0, 'flat', 1, False, False, (128, 64, 128)),
27
+ CityscapesClass('sidewalk', 8, 1, 'flat', 1, False, False, (244, 35, 232)),
28
+ CityscapesClass('parking', 9, 255, 'flat', 1, False, True, (250, 170, 160)),
29
+ CityscapesClass('rail track', 10, 255, 'flat', 1, False, True, (230, 150, 140)),
30
+ CityscapesClass('building', 11, 2, 'construction', 2, False, False, (70, 70, 70)),
31
+ CityscapesClass('wall', 12, 3, 'construction', 2, False, False, (102, 102, 156)),
32
+ CityscapesClass('fence', 13, 4, 'construction', 2, False, False, (190, 153, 153)),
33
+ CityscapesClass('guard rail', 14, 255, 'construction', 2, False, True, (180, 165, 180)),
34
+ CityscapesClass('bridge', 15, 255, 'construction', 2, False, True, (150, 100, 100)),
35
+ CityscapesClass('tunnel', 16, 255, 'construction', 2, False, True, (150, 120, 90)),
36
+ CityscapesClass('pole', 17, 5, 'object', 3, False, False, (153, 153, 153)),
37
+ CityscapesClass('polegroup', 18, 255, 'object', 3, False, True, (153, 153, 153)),
38
+ CityscapesClass('traffic light', 19, 6, 'object', 3, False, False, (250, 170, 30)),
39
+ CityscapesClass('traffic sign', 20, 7, 'object', 3, False, False, (220, 220, 0)),
40
+ CityscapesClass('vegetation', 21, 8, 'nature', 4, False, False, (107, 142, 35)),
41
+ CityscapesClass('terrain', 22, 9, 'nature', 4, False, False, (152, 251, 152)),
42
+ CityscapesClass('sky', 23, 10, 'sky', 5, False, False, (70, 130, 180)),
43
+ CityscapesClass('person', 24, 11, 'human', 6, True, False, (220, 20, 60)),
44
+ CityscapesClass('rider', 25, 12, 'human', 6, True, False, (255, 0, 0)),
45
+ CityscapesClass('car', 26, 13, 'vehicle', 7, True, False, (0, 0, 142)),
46
+ CityscapesClass('truck', 27, 14, 'vehicle', 7, True, False, (0, 0, 70)),
47
+ CityscapesClass('bus', 28, 15, 'vehicle', 7, True, False, (0, 60, 100)),
48
+ CityscapesClass('caravan', 29, 255, 'vehicle', 7, True, True, (0, 0, 90)),
49
+ CityscapesClass('trailer', 30, 255, 'vehicle', 7, True, True, (0, 0, 110)),
50
+ CityscapesClass('train', 31, 16, 'vehicle', 7, True, False, (0, 80, 100)),
51
+ CityscapesClass('motorcycle', 32, 17, 'vehicle', 7, True, False, (0, 0, 230)),
52
+ CityscapesClass('bicycle', 33, 18, 'vehicle', 7, True, False, (119, 11, 32)),
53
+ CityscapesClass('license plate', -1, -1, 'vehicle', 7, False, True, (0, 0, 142)),
54
+ ]
55
+
56
+ cityscapes_id_to_trainID = {cls.id: cls.train_id for cls in classes}
57
+ cityscapes_trainID_to_testID = {cls.train_id: cls.id for cls in classes}
58
+ cityscapes_trainID_to_color = {cls.train_id: cls.color for cls in classes}
59
+ cityscapes_trainID_to_name = {cls.train_id: cls.name for cls in classes}
60
+ cityscapes_trainID_to_color[255] = (0, 0, 0)
61
+ cityscapes_trainID_to_name = {cls.train_id: cls.name for cls in classes}
62
+ cityscapes_trainID_to_name[255] = 'ignore'
63
+ cityscapes_trainID_to_name[19] = 'ignore'
64
+
65
+
66
+ def map2cs(seg):
67
+ while len(seg.shape) > 2:
68
+ seg = seg[0]
69
+ colors = cityscapes_trainID_to_color
70
+ # assert False, 'set ignore_idx color to black, make sure that it is not in colors'
71
+ rgb = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
72
+ for l in np.unique(seg):
73
+ rgb[seg == l, :] = colors[l]
74
+ return rgb
75
+
76
+
77
+ def get_colors(num_colors):
78
+ from PIL import ImageColor
79
+ import matplotlib
80
+ hex_colors = [
81
+ # "#000000", # keep the black reserved
82
+ "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", "#008941", "#006FA6", "#A30059",
83
+ "#FFDBE5", "#7A4900", "#0000A6", "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87",
84
+ "#5A0007", "#809693", "#FEFFE6", "#1B4400", "#4FC601", "#3B5DFF", "#4A3B53", "#FF2F80",
85
+ "#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",
86
+ "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",
87
+ "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",
88
+ "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",
89
+ "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",
90
+ "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",
91
+ "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",
92
+ "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",
93
+ "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",
94
+ "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72", "#6A3A4C",
95
+ "#83AB58", "#001C1E", "#D1F7CE", "#004B28", "#C8D0F6", "#A3A489", "#806C66", "#222800",
96
+ "#BF5650", "#E83000", "#66796D", "#DA007C", "#FF1A59", "#8ADBB4", "#1E0200", "#5B4E51",
97
+ "#C895C5", "#320033", "#FF6832", "#66E1D3", "#CFCDAC", "#D0AC94", "#7ED379", "#012C58",
98
+ ]
99
+ hex_colors_mlib = list(matplotlib.colors.cnames.values())
100
+ for hcm in hex_colors_mlib:
101
+ if hcm not in hex_colors:
102
+ hex_colors.append(hcm)
103
+ colors = [ImageColor.getrgb(hex) for hex in hex_colors]
104
+ return colors[:num_colors]
105
+
106
+
107
+ def colorize_one(seg, ignore=None, colors=None, ncolors=32):
108
+ unq = np.unique(seg)
109
+ if ncolors is not None:
110
+ ncolors = max(ncolors, max(unq))
111
+ else:
112
+ ncolors = max(unq)
113
+ colors = get_colors(ncolors) if colors is None else colors
114
+ h, w = seg.shape
115
+ c = 3
116
+ rgb = np.zeros((h, w, c), dtype=np.uint8)
117
+ for l in unq:
118
+ if ignore is not None and l == ignore:
119
+ continue
120
+ try:
121
+ rgb[seg == l, :] = colors[l]
122
+ except:
123
+ raise Exception(l)
124
+ return rgb
125
+
126
+
127
+ def init_weights(m):
128
+ if isinstance(m, nn.Linear):
129
+ trunc_normal_(m.weight, std=0.02)
130
+ if isinstance(m, nn.Linear) and m.bias is not None:
131
+ nn.init.constant_(m.bias, 0)
132
+ elif isinstance(m, nn.LayerNorm):
133
+ nn.init.constant_(m.bias, 0)
134
+ nn.init.constant_(m.weight, 1.0)
135
+
136
+
137
+ def resize_pos_embed(posemb, grid_old_shape, grid_new_shape, num_extra_tokens):
138
+ # Rescale the grid of position embeddings when loading from state_dict. Adapted from
139
+ # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224
140
+ posemb_tok, posemb_grid = (
141
+ posemb[:, :num_extra_tokens],
142
+ posemb[0, num_extra_tokens:],
143
+ )
144
+ if grid_old_shape is None:
145
+ gs_old_h = int(math.sqrt(len(posemb_grid)))
146
+ gs_old_w = gs_old_h
147
+ else:
148
+ gs_old_h, gs_old_w = grid_old_shape
149
+
150
+ gs_h, gs_w = grid_new_shape
151
+ posemb_grid = posemb_grid.reshape(1, gs_old_h, gs_old_w, -1).permute(0, 3, 1, 2)
152
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
153
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
154
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
155
+ return posemb
156
+
157
+
158
+ def checkpoint_filter_fn(state_dict, model):
159
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
160
+ out_dict = {}
161
+ if "model" in state_dict:
162
+ # For deit models
163
+ state_dict = state_dict["model"]
164
+ num_extra_tokens = 1 + ("dist_token" in state_dict.keys())
165
+ patch_size = model.patch_size
166
+ image_size = model.patch_embed.image_size
167
+ for k, v in state_dict.items():
168
+ if k == "pos_embed" and v.shape != model.pos_embed.shape:
169
+ # To resize pos embedding when using model at different size from pretrained weights
170
+ v = resize_pos_embed(
171
+ v,
172
+ None,
173
+ (image_size[0] // patch_size, image_size[1] // patch_size),
174
+ num_extra_tokens,
175
+ )
176
+ out_dict[k] = v
177
+ return out_dict
178
+
179
+
180
+ def padding(im, patch_size, fill_value=0):
181
+ # make the image sizes divisible by patch_size
182
+ H, W = im.size(2), im.size(3)
183
+ pad_h, pad_w = 0, 0
184
+ if H % patch_size > 0:
185
+ pad_h = patch_size - (H % patch_size)
186
+ if W % patch_size > 0:
187
+ pad_w = patch_size - (W % patch_size)
188
+ im_padded = im
189
+ if pad_h > 0 or pad_w > 0:
190
+ im_padded = F.pad(im, (0, pad_w, 0, pad_h), value=fill_value)
191
+ return im_padded
192
+
193
+
194
+ def unpadding(y, target_size):
195
+ H, W = target_size
196
+ H_pad, W_pad = y.size(2), y.size(3)
197
+ # crop predictions on extra pixels coming from padding
198
+ extra_h = H_pad - H
199
+ extra_w = W_pad - W
200
+ if extra_h > 0:
201
+ y = y[:, :, :-extra_h]
202
+ if extra_w > 0:
203
+ y = y[:, :, :, :-extra_w]
204
+ return y
205
+
206
+
207
+ def resize(im, smaller_size):
208
+ h, w = im.shape[2:]
209
+ if h < w:
210
+ ratio = w / h
211
+ h_res, w_res = smaller_size, ratio * smaller_size
212
+ else:
213
+ ratio = h / w
214
+ h_res, w_res = ratio * smaller_size, smaller_size
215
+ if min(h, w) < smaller_size:
216
+ im_res = F.interpolate(im, (int(h_res), int(w_res)), mode="bilinear")
217
+ else:
218
+ im_res = im
219
+ return im_res
220
+
221
+
222
+ def sliding_window(im, flip, window_size, window_stride, channels_first=True):
223
+ if channels_first:
224
+ B, C, H, W = im.shape
225
+ else:
226
+ B, H, W, C = im.shape
227
+ ws = window_size
228
+
229
+ windows = {"crop": [], "anchors": []}
230
+ h_anchors = torch.arange(0, H, window_stride)
231
+ w_anchors = torch.arange(0, W, window_stride)
232
+ h_anchors = [h.item() for h in h_anchors if h < H - ws] + [H - ws]
233
+ w_anchors = [w.item() for w in w_anchors if w < W - ws] + [W - ws]
234
+ for ha in h_anchors:
235
+ for wa in w_anchors:
236
+ if channels_first:
237
+ window = im[:, :, ha: ha + ws, wa: wa + ws]
238
+ else:
239
+ window = im[:, ha: ha + ws, wa: wa + ws]
240
+ windows["crop"].append(window)
241
+ windows["anchors"].append((ha, wa))
242
+ windows["flip"] = flip
243
+ windows["shape"] = (H, W)
244
+ return windows
245
+
246
+
247
+ def merge_windows(windows, window_size, ori_shape, no_softmax=False, no_upsample=False, patch_size=None):
248
+ ws = window_size
249
+ im_windows = windows["seg_maps"]
250
+ anchors = windows["anchors"]
251
+ C = im_windows[0].shape[0]
252
+ H, W = windows["shape"]
253
+ flip = windows["flip"]
254
+
255
+ if no_upsample:
256
+ H, W = H // patch_size, W // patch_size
257
+
258
+ logit = torch.zeros((C, H, W), device=im_windows.device)
259
+ count = torch.zeros((1, H, W), device=im_windows.device)
260
+ for window, (ha, wa) in zip(im_windows, anchors):
261
+ if no_upsample:
262
+ ha = ha // patch_size
263
+ wa = wa // patch_size
264
+ logit[:, ha: ha + ws, wa: wa + ws] += window
265
+ count[:, ha: ha + ws, wa: wa + ws] += 1
266
+ logit /= count
267
+ # print('Interpolate {} -> {}'.format(logit.shape, ori_shape))
268
+ if not no_upsample:
269
+ logit = F.interpolate(
270
+ logit.unsqueeze(0),
271
+ ori_shape,
272
+ mode="bilinear",
273
+ )[0]
274
+ if flip:
275
+ logit = torch.flip(logit, (2,))
276
+ if not no_softmax:
277
+ # print('Softmax in merge_windows')
278
+ result = F.softmax(logit, 0)
279
+ else:
280
+ # print('No softmax in merge_windows')
281
+ result = logit
282
+ return result
283
+
284
+
285
+ def debug_windows(windows, debug_file):
286
+ pass
287
+
288
+
289
+ def inference_picie(
290
+ model,
291
+ classifier,
292
+ metric_test,
293
+ ims,
294
+ ori_shape,
295
+ window_size,
296
+ window_stride,
297
+ batch_size,
298
+ decoder_features=False,
299
+ no_upsample=False,
300
+ debug_file=None,
301
+ im_rgb=None,
302
+ channel_first=False
303
+ ):
304
+ try:
305
+ C = model.n_cls
306
+ except:
307
+ C = classifier.module.bias.shape[0]
308
+
309
+ # seg_maps = []
310
+
311
+ # for im, im_metas in zip(ims, ims_metas):
312
+ for im in ims:
313
+ im = im.to('cuda')
314
+ if len(im.shape) == 3:
315
+ im = im.unsqueeze(0)
316
+ flip = False # im_metas["flip"]
317
+ windows = sliding_window(im, flip, window_size, window_stride)
318
+ crops = torch.stack(windows.pop("crop"))[:, 0]
319
+ num_crops = len(crops)
320
+
321
+ WB = batch_size if batch_size > 0 else num_crops
322
+ if no_upsample:
323
+ window_size = window_size // model.patch_size
324
+ seg_maps = torch.zeros((num_crops, C, window_size, window_size), device=im.device)
325
+ with torch.no_grad():
326
+ for i in range(0, num_crops, WB):
327
+ # try:
328
+ feats = model.forward(crops[i: i + WB])
329
+ if metric_test == 'cosine':
330
+ feats = F.normalize(feats, dim=1, p=2)
331
+ probs = classifier(feats)
332
+ probs = F.interpolate(probs, crops[i: i + WB].shape[-2:], mode='bilinear', align_corners=False)
333
+ seg_maps[i: i + WB] = probs
334
+ windows["seg_maps"] = seg_maps
335
+
336
+ if debug_file is not None:
337
+ if isinstance(im_rgb, torch.Tensor):
338
+ im_rgb = im_rgb.detach().cpu().numpy()
339
+ if len(im_rgb.shape) == 4:
340
+ im_rgb = im_rgb[0]
341
+ h, w = im.shape[-2:]
342
+ im_rgb = cv2.resize(im_rgb, (w, h), interpolation=cv2.INTER_LINEAR)
343
+
344
+ crops_rgb = np.stack(
345
+ sliding_window(im_rgb[None, :], flip, window_size, window_stride, channels_first=channel_first).pop(
346
+ "crop"))[:, 0]
347
+
348
+ im_seg_map = merge_windows(windows, window_size, ori_shape, no_softmax=decoder_features,
349
+ no_upsample=no_upsample, patch_size=None)
350
+
351
+ seg_map = im_seg_map
352
+ if no_upsample and not decoder_features:
353
+ pass
354
+ else:
355
+ seg_map = F.interpolate(
356
+ seg_map.unsqueeze(0),
357
+ ori_shape,
358
+ mode="bilinear",
359
+ )
360
+
361
+ return seg_map
362
+
363
+
364
+ def inference(
365
+ model,
366
+ ims,
367
+ ori_shape,
368
+ window_size,
369
+ window_stride,
370
+ batch_size,
371
+ decoder_features=False,
372
+ encoder_features=False,
373
+ save2cpu=False,
374
+ no_upsample=False,
375
+ debug_file=None,
376
+ im_rgb=None,
377
+ channel_first=False
378
+ ):
379
+ C = model.n_cls
380
+ patch_size = model.patch_size
381
+
382
+ # seg_maps = []
383
+
384
+ # for im, im_metas in zip(ims, ims_metas):
385
+ for im in ims:
386
+ im = im.to('cuda')
387
+ if len(im.shape) == 3:
388
+ im = im.unsqueeze(0)
389
+ # im = resize(im, window_size)
390
+ flip = False # im_metas["flip"]
391
+ # print(im)
392
+ windows = sliding_window(im, flip, window_size, window_stride)
393
+ # print(windows)
394
+ crops = torch.stack(windows.pop("crop"))[:, 0]
395
+ num_crops = len(crops)
396
+
397
+ WB = batch_size if batch_size > 0 else num_crops
398
+ if no_upsample:
399
+ window_size = window_size // model.patch_size
400
+ # print('Change variable window_size to {}'.format(window_size))
401
+ seg_maps = torch.zeros((num_crops, C, window_size, window_size), device=im.device)
402
+ # print('Allocated segm_maps: {}, device: {}'.format(seg_maps.shape, seg_maps.device))
403
+ with torch.no_grad():
404
+ for i in range(0, num_crops, WB):
405
+ # try:
406
+ seg_maps[i: i + WB] = model.forward(crops[i: i + WB], decoder_features=decoder_features,
407
+ encoder_features=encoder_features,
408
+ no_upsample=no_upsample)
409
+ # except:
410
+ # print('Input of shape: {}'.format(crops[i:i + WB].shape))
411
+ # assert False, "End after error."
412
+ # torch.cuda.empty_cache()
413
+ windows["seg_maps"] = seg_maps
414
+
415
+ if debug_file is not None:
416
+ if isinstance(im_rgb, torch.Tensor):
417
+ im_rgb = im_rgb.detach().cpu().numpy()
418
+ if len(im_rgb.shape) == 4:
419
+ im_rgb = im_rgb[0]
420
+ h, w = im.shape[-2:]
421
+ im_rgb = cv2.resize(im_rgb, (w, h), interpolation=cv2.INTER_LINEAR)
422
+
423
+ crops_rgb = np.stack(
424
+ sliding_window(im_rgb[None, :], flip, window_size, window_stride, channels_first=channel_first).pop(
425
+ "crop"))[:, 0]
426
+
427
+ windows_row = np.concatenate([w for w in crops_rgb], axis=1)
428
+ # print(windows_row)
429
+ try:
430
+ Image.fromarray(windows_row).save(debug_file)
431
+ except:
432
+ pass
433
+
434
+ suffix = debug_file[-4:]
435
+ debug_file = debug_file.replace(suffix, '_preds{}'.format(suffix))
436
+ windows_preds = seg_maps.argmax(dim=1).cpu().numpy()
437
+ windows_preds_row = np.concatenate([colorize_one(wp, ignore=255, ncolors=C) for wp in windows_preds], axis=1)  # colorize_one (defined above) stands in for seg2rgb, whose import is commented out
438
+ windows_row_plus_preds = np.concatenate((windows_row, windows_preds_row), axis=0)
439
+ try:
440
+ Image.fromarray(windows_preds_row).save(debug_file)
441
+ except:
442
+ pass
443
+
444
+ debug_file = debug_file.replace(suffix, '_wImg{}'.format(suffix))
445
+ try:
446
+ Image.fromarray(windows_row_plus_preds).save(debug_file)
447
+ except:
448
+ pass
449
+
450
+ im_seg_map = merge_windows(windows, window_size, ori_shape, no_softmax=decoder_features,
451
+ no_upsample=no_upsample, patch_size=model.patch_size)
452
+
453
+ seg_map = im_seg_map
454
+ if no_upsample and not decoder_features:
455
+ pass
456
+ else:
457
+ seg_map = F.interpolate(
458
+ seg_map.unsqueeze(0),
459
+ ori_shape,
460
+ mode="bilinear",
461
+ )
462
+ # seg_maps.append(seg_map)
463
+
464
+ # print('Done one inference.')
465
+ # seg_maps = torch.cat(seg_maps, dim=0)
466
+ return seg_map
467
+
468
+
469
+ def inference_features(
470
+ model,
471
+ ims,
472
+ ori_shape,
473
+ window_size,
474
+ window_stride,
475
+ batch_size,
476
+ decoder_features=False,
477
+ encoder_features=False,
478
+ save2cpu=False,
479
+ no_upsample=True,
480
+ encoder_only=False
481
+ ):
482
+ C = model.n_cls if decoder_features else model.encoder.d_model
483
+ patch_size = model.patch_size
484
+
485
+ # seg_maps = []
486
+
487
+ # for im, im_metas in zip(ims, ims_metas):
488
+ for im in ims:
489
+ im = im.to('cuda')
490
+ if len(im.shape) == 3:
491
+ im = im.unsqueeze(0)
492
+ # im = resize(im, window_size)
493
+ flip = False # im_metas["flip"]
494
+ # print(im)
495
+ windows = sliding_window(im, flip, window_size, window_stride)
496
+ # print(windows)
497
+ crops = torch.stack(windows.pop("crop"))[:, 0]
498
+ num_crops = len(crops)
499
+
500
+ WB = batch_size if batch_size > 0 else num_crops
501
+ if no_upsample:
502
+ window_size = window_size // model.patch_size
503
+ # print('Change variable window_size to {}'.format(window_size))
504
+ enc_maps = torch.zeros((num_crops, C, window_size, window_size), device=im.device)
505
+ if decoder_features:
506
+ dec_maps = torch.zeros((num_crops, C, window_size, window_size), device=im.device)
507
+ # print('Allocated segm_maps: {}, device: {}'.format(seg_maps.shape, seg_maps.device))
508
+ with torch.no_grad():
509
+ for i in range(0, num_crops, WB):
510
+ enc_fts = model.forward(crops[i: i + WB], decoder_features=decoder_features,
511
+ encoder_features=True,
512
+ no_upsample=no_upsample, encoder_only=encoder_only)
513
+ if decoder_features:
514
+ enc_fts, dec_fts = enc_fts
515
+ dec_maps[i: i + WB] = dec_fts
516
+ elif isinstance(enc_fts, tuple):
517
+ enc_fts = enc_fts[0]
518
+ enc_maps[i: i + WB] = enc_fts
519
+
520
+ windows["seg_maps"] = enc_maps
521
+ im_enc_map = merge_windows(windows, window_size, ori_shape, no_softmax=decoder_features,
522
+ no_upsample=no_upsample, patch_size=model.patch_size)
523
+
524
+ if decoder_features:
525
+ windows["seg_maps"] = dec_maps
526
+ im_dec_map = merge_windows(windows, window_size, ori_shape, no_softmax=decoder_features,
527
+ no_upsample=no_upsample, patch_size=model.patch_size)
528
+
529
+ if no_upsample:
530
+ pass
531
+ else:
532
+ im_enc_map = F.interpolate(
533
+ im_enc_map.unsqueeze(0),
534
+ ori_shape,
535
+ mode="bilinear",
536
+ )
537
+ if decoder_features:
538
+ im_dec_map = F.interpolate(
539
+ im_dec_map.unsqueeze(0),
540
+ ori_shape,
541
+ mode="bilinear",
542
+ )
543
+
544
+ im_enc_map = im_enc_map.cpu().numpy()
545
+ if decoder_features:
546
+ im_dec_map = im_dec_map.cpu().numpy()
547
+ return im_enc_map, im_dec_map
548
+
549
+ return im_enc_map
550
+
551
+
552
+ def inference_conv(
553
+ model,
554
+ ims,
555
+ ims_metas,
556
+ ori_shape
557
+ ):
558
+ assert len(ims) == 1
559
+ for im, im_metas in zip(ims, ims_metas):
560
+ im = im.to('cuda')  # ptu is not imported in this file; use 'cuda' as in the other inference helpers
561
+ if len(im.shape) < 4:
562
+ im = im.unsqueeze(0)
563
+ logits = model(im)
564
+ if ori_shape[:2] != logits.shape[-2:]:
565
+ # resize
566
+ logits = F.interpolate(
567
+ logits,
568
+ ori_shape[-2:],
569
+ mode="bilinear",
570
+ )
571
+ # 3) applies softmax
572
+ result = F.softmax(logits.squeeze(), 0)
573
+ # print(result.shape)
574
+ return result
575
+
576
+
577
+ def num_params(model):
578
+ model_parameters = filter(lambda p: p.requires_grad, model.parameters())
579
+ n_params = sum([torch.prod(torch.tensor(p.size())) for p in model_parameters])
580
+ if not type(n_params) == int:
581
+ n_params = n_params.item()
582
+ return n_params
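The sliding-window helpers carry most of the logic of the `inference*` functions above: `sliding_window` collects overlapping `window_size` crops at `window_stride`, the caller fills `windows["seg_maps"]` with per-crop predictions, and `merge_windows` averages the overlapping logits, optionally upsamples them to `ori_shape`, and applies a softmax over classes. A self-contained sketch with random logits instead of a real model forward pass (illustrative only):

import torch

from segmenter_model.utils import sliding_window, merge_windows

n_cls, ws, stride = 19, 256, 128
im = torch.randn(1, 3, 512, 768)

windows = sliding_window(im, flip=False, window_size=ws, window_stride=stride)
crops = torch.stack(windows.pop("crop"))[:, 0]  # (15, 3, 256, 256): overlapping crops

# Dummy per-crop predictions stand in for a real Segmenter forward pass.
windows["seg_maps"] = torch.randn(len(crops), n_cls, ws, ws)

probs = merge_windows(windows, window_size=ws, ori_shape=(512, 768))
print(probs.shape)                # torch.Size([19, 512, 768])
print(float(probs.sum(0)[0, 0]))  # ~1.0: softmax over classes after averaging overlaps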
segmenter_model/vit_dino.py ADDED
@@ -0,0 +1,348 @@
1
+ # Copied from DINO
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Mostly copy-paste from timm library.
17
+ https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
18
+ """
19
+ import math
20
+ import warnings
21
+ from functools import partial
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+
26
+
27
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
28
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
29
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
30
+ def norm_cdf(x):
31
+ # Computes standard normal cumulative distribution function
32
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
33
+
34
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
35
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
36
+ "The distribution of values may be incorrect.",
37
+ stacklevel=2)
38
+
39
+ with torch.no_grad():
40
+ # Values are generated by using a truncated uniform distribution and
41
+ # then using the inverse CDF for the normal distribution.
42
+ # Get upper and lower cdf values
43
+ l = norm_cdf((a - mean) / std)
44
+ u = norm_cdf((b - mean) / std)
45
+
46
+ # Uniformly fill tensor with values from [l, u], then translate to
47
+ # [2l-1, 2u-1].
48
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
49
+
50
+ # Use inverse cdf transform for normal distribution to get truncated
51
+ # standard normal
52
+ tensor.erfinv_()
53
+
54
+ # Transform to proper mean, std
55
+ tensor.mul_(std * math.sqrt(2.))
56
+ tensor.add_(mean)
57
+
58
+ # Clamp to ensure it's in the proper range
59
+ tensor.clamp_(min=a, max=b)
60
+ return tensor
61
+
62
+
63
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
64
+ # type: (Tensor, float, float, float, float) -> Tensor
65
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
66
+
67
+
68
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
69
+ if drop_prob == 0. or not training:
70
+ return x
71
+ keep_prob = 1 - drop_prob
72
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
73
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
74
+ random_tensor.floor_() # binarize
75
+ output = x.div(keep_prob) * random_tensor
76
+ return output
77
+
78
+
79
+ class DropPath(nn.Module):
80
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
81
+ """
82
+
83
+ def __init__(self, drop_prob=None):
84
+ super(DropPath, self).__init__()
85
+ self.drop_prob = drop_prob
86
+
87
+ def forward(self, x):
88
+ return drop_path(x, self.drop_prob, self.training)
89
+
90
+
91
+ class Mlp(nn.Module):
92
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
93
+ super().__init__()
94
+ out_features = out_features or in_features
95
+ hidden_features = hidden_features or in_features
96
+ self.fc1 = nn.Linear(in_features, hidden_features)
97
+ self.act = act_layer()
98
+ self.fc2 = nn.Linear(hidden_features, out_features)
99
+ self.drop = nn.Dropout(drop)
100
+
101
+ def forward(self, x):
102
+ x = self.fc1(x)
103
+ x = self.act(x)
104
+ x = self.drop(x)
105
+ x = self.fc2(x)
106
+ x = self.drop(x)
107
+ return x
108
+
109
+
110
+ class Attention(nn.Module):
111
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
112
+ super().__init__()
113
+ self.num_heads = num_heads
114
+ head_dim = dim // num_heads
115
+ self.scale = qk_scale or head_dim ** -0.5
116
+
117
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
118
+ self.attn_drop = nn.Dropout(attn_drop)
119
+ self.proj = nn.Linear(dim, dim)
120
+ self.proj_drop = nn.Dropout(proj_drop)
121
+
122
+ def forward(self, x):
123
+ B, N, C = x.shape
124
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
125
+ q, k, v = qkv[0], qkv[1], qkv[2]
126
+
127
+ attn = (q @ k.transpose(-2, -1)) * self.scale
128
+ attn = attn.softmax(dim=-1)
129
+ attn = self.attn_drop(attn)
130
+
131
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
132
+ x = self.proj(x)
133
+ x = self.proj_drop(x)
134
+ return x, attn
135
+
136
+
137
+ class Block(nn.Module):
138
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
139
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
140
+ super().__init__()
141
+ self.norm1 = norm_layer(dim)
142
+ self.attn = Attention(
143
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
144
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
145
+ self.norm2 = norm_layer(dim)
146
+ mlp_hidden_dim = int(dim * mlp_ratio)
147
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
148
+
149
+ def forward(self, x, return_attention=False):
150
+ y, attn = self.attn(self.norm1(x))
151
+ if return_attention:
152
+ return attn
153
+ x = x + self.drop_path(y)
154
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
155
+ return x
156
+
157
+
158
+ class PatchEmbed(nn.Module):
159
+ """ Image to Patch Embedding
160
+ """
161
+
162
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
163
+ super().__init__()
164
+ num_patches = (img_size // patch_size) * (img_size // patch_size)
165
+ self.img_size = img_size
166
+ self.patch_size = patch_size
167
+ self.num_patches = num_patches
168
+
169
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
170
+
171
+ def forward(self, x):
172
+ B, C, H, W = x.shape
173
+ x = self.proj(x).flatten(2).transpose(1, 2)
174
+ return x
175
+
176
+
177
+ class VisionTransformer(nn.Module):
178
+ """ Vision Transformer """
179
+
180
+ def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12,
181
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
182
+ drop_path_rate=0., norm_layer=nn.LayerNorm, **kwargs):
183
+ super().__init__()
184
+ self.num_features = self.embed_dim = embed_dim
185
+
186
+ self.patch_embed = PatchEmbed(
187
+ img_size=img_size[0], patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
188
+ num_patches = self.patch_embed.num_patches
189
+
190
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
191
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
192
+ self.pos_drop = nn.Dropout(p=drop_rate)
193
+
194
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
195
+ self.blocks = nn.ModuleList([
196
+ Block(
197
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
198
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer)
199
+ for i in range(depth)])
200
+ self.norm = norm_layer(embed_dim)
201
+
202
+ # Classifier head
203
+ self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
204
+
205
+ trunc_normal_(self.pos_embed, std=.02)
206
+ trunc_normal_(self.cls_token, std=.02)
207
+ self.apply(self._init_weights)
208
+
209
+ def _init_weights(self, m):
210
+ if isinstance(m, nn.Linear):
211
+ trunc_normal_(m.weight, std=.02)
212
+ if isinstance(m, nn.Linear) and m.bias is not None:
213
+ nn.init.constant_(m.bias, 0)
214
+ elif isinstance(m, nn.LayerNorm):
215
+ nn.init.constant_(m.bias, 0)
216
+ nn.init.constant_(m.weight, 1.0)
217
+
218
+ def interpolate_pos_encoding(self, x, w, h):
219
+ npatch = x.shape[1] - 1
220
+ N = self.pos_embed.shape[1] - 1
221
+ if npatch == N and w == h:
222
+ return self.pos_embed
223
+ class_pos_embed = self.pos_embed[:, 0]
224
+ patch_pos_embed = self.pos_embed[:, 1:]
225
+ dim = x.shape[-1]
226
+ w0 = w // self.patch_embed.patch_size
227
+ h0 = h // self.patch_embed.patch_size
228
+ # we add a small number to avoid floating point error in the interpolation
229
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
230
+ w0, h0 = w0 + 0.1, h0 + 0.1
231
+ patch_pos_embed = nn.functional.interpolate(
232
+ patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
233
+ scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
234
+ mode='bicubic',
235
+ )
236
+ assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
237
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
238
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
239
+
240
+ def prepare_tokens(self, x):
241
+ B, nc, w, h = x.shape
242
+ x = self.patch_embed(x) # patch linear embedding
243
+
244
+ # add the [CLS] token to the embed patch tokens
245
+ cls_tokens = self.cls_token.expand(B, -1, -1)
246
+ x = torch.cat((cls_tokens, x), dim=1)
247
+
248
+ # add positional encoding to each token
249
+ x = x + self.interpolate_pos_encoding(x, w, h)
250
+
251
+ return self.pos_drop(x)
252
+
253
+ def forward(self, x):
254
+ x = self.prepare_tokens(x)
255
+ for blk in self.blocks:
256
+ x = blk(x)
257
+ x = self.norm(x)
258
+ return x[:, 0]
259
+
260
+ def get_last_selfattention(self, x):
261
+ x = self.prepare_tokens(x)
262
+ for i, blk in enumerate(self.blocks):
263
+ if i < len(self.blocks) - 1:
264
+ x = blk(x)
265
+ else:
266
+ # return attention of the last block
267
+ return blk(x, return_attention=True)
268
+
269
+ def get_n_last_selfattentions(self, x, layers_from_end=(1)):
270
+ x = self.prepare_tokens(x)
271
+ attentions = []
272
+ for i, blk in enumerate(self.blocks):
273
+ num_from_end = len(self.blocks) - i
274
+ if num_from_end in layers_from_end:
275
+ # get attention of the block
276
+ attn = blk(x, return_attention=True)
277
+ attentions.append(attn)
278
+ x = blk(x)
279
+ return attentions
280
+
281
+ def get_intermediate_layers(self, x, n=1):
282
+ x = self.prepare_tokens(x)
283
+ # we return the output tokens from the `n` last blocks
284
+ output = []
285
+ for i, blk in enumerate(self.blocks):
286
+ x = blk(x)
287
+ if len(self.blocks) - i <= n:
288
+ output.append(self.norm(x))
289
+ return output
290
+
291
+
292
+ def vit_tiny(patch_size=16, **kwargs):
293
+ model = VisionTransformer(
294
+ patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4,
295
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
296
+ return model
297
+
298
+
299
+ def vit_small(patch_size=16, **kwargs):
300
+ model = VisionTransformer(
301
+ patch_size=patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4,
302
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
303
+ return model
304
+
305
+
306
+ def vit_base(patch_size=16, **kwargs):
307
+ model = VisionTransformer(
308
+ patch_size=patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4,
309
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)
310
+ return model
311
+
312
+
313
+ class DINOHead(nn.Module):
314
+ def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048,
315
+ bottleneck_dim=256):
316
+ super().__init__()
317
+ nlayers = max(nlayers, 1)
318
+ if nlayers == 1:
319
+ self.mlp = nn.Linear(in_dim, bottleneck_dim)
320
+ else:
321
+ layers = [nn.Linear(in_dim, hidden_dim)]
322
+ if use_bn:
323
+ layers.append(nn.BatchNorm1d(hidden_dim))
324
+ layers.append(nn.GELU())
325
+ for _ in range(nlayers - 2):
326
+ layers.append(nn.Linear(hidden_dim, hidden_dim))
327
+ if use_bn:
328
+ layers.append(nn.BatchNorm1d(hidden_dim))
329
+ layers.append(nn.GELU())
330
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim))
331
+ self.mlp = nn.Sequential(*layers)
332
+ self.apply(self._init_weights)
333
+ self.last_layer = nn.utils.weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
334
+ self.last_layer.weight_g.data.fill_(1)
335
+ if norm_last_layer:
336
+ self.last_layer.weight_g.requires_grad = False
337
+
338
+ def _init_weights(self, m):
339
+ if isinstance(m, nn.Linear):
340
+ trunc_normal_(m.weight, std=.02)
341
+ if isinstance(m, nn.Linear) and m.bias is not None:
342
+ nn.init.constant_(m.bias, 0)
343
+
344
+ def forward(self, x):
345
+ x = self.mlp(x)
346
+ x = nn.functional.normalize(x, dim=-1, p=2)
347
+ x = self.last_layer(x)
348
+ return x
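These are the stock DINO ViT definitions: the factory functions build backbones that return the `[CLS]` token by default, while `get_last_selfattention` / `get_intermediate_layers` expose attention maps and patch tokens. A short sketch with randomly initialised weights (no pretrained DINO checkpoint is loaded here; 65536 is only a commonly used DINO head output size):

import torch

from segmenter_model.vit_dino import vit_small, DINOHead

backbone = vit_small(patch_size=16)          # ViT-S/16: embed_dim 384, 12 blocks, 6 heads, no classifier
head = DINOHead(in_dim=384, out_dim=65536)   # projection head: MLP -> L2-normalised bottleneck -> weight-normed linear

x = torch.randn(2, 3, 224, 224)
cls = backbone(x)                            # (2, 384): final [CLS] token
proj = head(cls)                             # (2, 65536)

attn = backbone.get_last_selfattention(x)    # (2, 6, 197, 197): last-block attention over [CLS] + 196 patches
feats = backbone.get_intermediate_layers(x, n=4)  # list of 4 tensors, each (2, 197, 384)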