# Spaces: Realcat / image-matching-webui (status: Running)
#
# File size: 5,344 Bytes

from typing import Optional, Union
import torch
from torch import device
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as tvm
import gc

# Device-type string ("cuda" when CUDA is available, otherwise "cpu") that the
# encoders in this module pass to ``torch.autocast``.
# NOTE: this assignment rebinds the name ``device`` imported from ``torch``.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


class ResNet50(nn.Module):
    """ResNet-50 backbone that returns a dict of intermediate feature maps.

    The torchvision network is stored in ``self.net``. ``forward`` runs its
    stem and residual stages one by one and records the activation after each
    of them under the integer keys 1, 2, 4, 8, 16 and 32 (presumably the
    nominal downsampling factor of each stage -- confirm when ``dilation`` is
    used).

    Args:
        pretrained: Forwarded as ``tvm.resnet50(pretrained=...)`` when
            ``weights`` is None.
        high_res: Stored on the instance; not read inside this class.
        weights: When not None, forwarded as ``tvm.resnet50(weights=...)``
            and ``pretrained`` is ignored.
        dilation: Forwarded as ``replace_stride_with_dilation``. Defaults to
            ``[False, False, False]``.
        freeze_bn: When true, ``train`` puts every ``nn.BatchNorm2d``
            submodule back into eval mode.
        anti_aliased: When true, no network is constructed (see the note in
            ``__init__``).
        early_exit: When true, ``forward`` returns right after ``layer2``.
        amp: Enables ``torch.autocast`` inside ``forward``.
    """

    def __init__(
        self,
        pretrained=False,
        high_res=False,
        weights=None,
        dilation=None,
        freeze_bn=True,
        anti_aliased=False,
        early_exit=False,
        amp=False,
    ) -> None:
        super().__init__()
        if dilation is None:
            dilation = [False, False, False]
        if anti_aliased:
            # NOTE(review): no anti-aliased variant is built here, so
            # ``self.net`` stays unset and ``forward`` fails with an
            # AttributeError unless a network is assigned afterwards.
            pass
        elif weights is not None:
            self.net = tvm.resnet50(
                weights=weights, replace_stride_with_dilation=dilation
            )
        else:
            self.net = tvm.resnet50(
                pretrained=pretrained, replace_stride_with_dilation=dilation
            )

        self.high_res = high_res
        self.freeze_bn = freeze_bn
        self.early_exit = early_exit
        self.amp = amp
        # Autocast dtype: bfloat16 on CUDA devices that support it, float16 on
        # other CUDA devices, float32 when CUDA is unavailable.
        if torch.cuda.is_available():
            if torch.cuda.is_bf16_supported():
                self.amp_dtype = torch.bfloat16
            else:
                self.amp_dtype = torch.float16
        else:
            self.amp_dtype = torch.float32

    def forward(self, x, **kwargs):
        """Run the backbone and collect per-stage activations.

        Args:
            x: Input tensor fed to ``self.net.conv1``.
            **kwargs: Accepted and ignored.

        Returns:
            dict mapping 1 -> the input itself, 2 -> output of the stem
            (conv1/bn1/relu), 4 -> ``layer1``, 8 -> ``layer2`` and, unless
            ``early_exit`` is set, 16 -> ``layer3`` and 32 -> ``layer4``.
        """
        # ``device`` is the module-level device-type string.
        with torch.autocast(device, enabled=self.amp, dtype=self.amp_dtype):
            net = self.net
            feats = {1: x}
            x = net.conv1(x)
            x = net.bn1(x)
            x = net.relu(x)
            feats[2] = x
            x = net.maxpool(x)
            x = net.layer1(x)
            feats[4] = x
            x = net.layer2(x)
            feats[8] = x
            if self.early_exit:
                return feats
            x = net.layer3(x)
            feats[16] = x
            x = net.layer4(x)
            feats[32] = x
            return feats

    def train(self, mode=True):
        """Set training mode, keeping batch-norm layers frozen if requested.

        Args:
            mode: True for training mode, False for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train`` so that calls such as
            ``model.train()`` and ``model.eval()`` can be chained.
        """
        super().train(mode)
        if self.freeze_bn:
            # Undo the mode switch for every BatchNorm2d so it stays in eval
            # mode regardless of ``mode``.
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
        return self


class VGG19(nn.Module):
    """VGG-19 (batch-norm variant) encoder exposing pre-pooling feature maps.

    Only the first 40 entries of torchvision's ``vgg19_bn().features`` are
    kept. ``forward`` returns a dict holding the activation seen just before
    each max-pool layer; keys start at 1 and double at every pool.

    Args:
        pretrained: Passed through as ``tvm.vgg19_bn(pretrained=...)``.
        amp: Turns on ``torch.autocast`` inside ``forward``.
    """

    def __init__(self, pretrained=False, amp=False) -> None:
        super().__init__()
        backbone = tvm.vgg19_bn(pretrained=pretrained)
        self.layers = nn.ModuleList(backbone.features[:40])
        self.amp = amp
        # Autocast dtype: float32 without CUDA, otherwise bfloat16 when the
        # GPU supports it and float16 when it does not.
        if not torch.cuda.is_available():
            self.amp_dtype = torch.float32
        elif torch.cuda.is_bf16_supported():
            self.amp_dtype = torch.bfloat16
        else:
            self.amp_dtype = torch.float16

    def forward(self, x, **kwargs):
        """Run the truncated VGG stack and gather the pre-pool activations.

        Args:
            x: Input tensor fed to the first layer.
            **kwargs: Accepted and ignored.

        Returns:
            dict mapping 1, 2, 4, ... to the tensor that entered each
            ``nn.MaxPool2d`` layer, in the order the pools are encountered.
        """
        pyramid = {}
        factor = 1
        # ``device`` is the module-level device-type string.
        with torch.autocast(device, enabled=self.amp, dtype=self.amp_dtype):
            for stage in self.layers:
                if isinstance(stage, nn.MaxPool2d):
                    # Record the activation before it is pooled.
                    pyramid[factor] = x
                    factor *= 2
                x = stage(x)
        return pyramid


class CNNandDinov2(nn.Module):
    """Encoder pairing a CNN feature pyramid with DINOv2 ViT-L/14 features.

    The CNN (``ResNet50`` or ``VGG19``) is a regular submodule stored in
    ``self.cnn``. The DINOv2 model is kept inside a plain Python list
    (``self.dinov2_vitl14``) so it is not registered as a submodule and its
    parameters stay hidden from DDP.

    Args:
        cnn_kwargs: Keyword arguments for the CNN constructor; ``None`` means
            no extra arguments.
        amp: When true, the DINOv2 model is cast to ``self.amp_dtype`` at
            construction time.
        use_vgg: Selects ``VGG19`` instead of ``ResNet50`` as the CNN.
        dinov2_weights: State dict for the ViT. When ``None`` the pretrained
            ViT-L/14 checkpoint is downloaded via ``torch.hub``.
    """

    def __init__(self, cnn_kwargs=None, amp=False, use_vgg=False, dinov2_weights=None):
        super().__init__()
        if dinov2_weights is None:
            dinov2_weights = torch.hub.load_state_dict_from_url(
                "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth",
                map_location="cpu",
            )
        from .transformer import vit_large

        vit_kwargs = dict(
            img_size=518,
            patch_size=14,
            init_values=1.0,
            ffn_layer="mlp",
            block_chunks=0,
        )

        dinov2_vitl14 = vit_large(**vit_kwargs).eval()
        dinov2_vitl14.load_state_dict(dinov2_weights)
        cnn_kwargs = cnn_kwargs if cnn_kwargs is not None else {}
        if not use_vgg:
            self.cnn = ResNet50(**cnn_kwargs)
        else:
            self.cnn = VGG19(**cnn_kwargs)
        self.amp = amp
        # Autocast dtype: bfloat16 on CUDA devices that support it, float16 on
        # other CUDA devices, float32 when CUDA is unavailable.
        if torch.cuda.is_available():
            if torch.cuda.is_bf16_supported():
                self.amp_dtype = torch.bfloat16
            else:
                self.amp_dtype = torch.float16
        else:
            self.amp_dtype = torch.float32
        if self.amp:
            dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
        self.dinov2_vitl14 = [dinov2_vitl14]  # ugly hack to not show parameters to DDP

    def train(self, mode: bool = True):
        """Switch the CNN between training and evaluation mode.

        Only ``self.cnn`` is toggled; the DINOv2 model lives in a list, is
        not a registered submodule, and is left in the eval mode it was put
        in by ``__init__``.

        Args:
            mode: True for training mode, False for evaluation mode.

        Returns:
            ``self``, matching ``nn.Module.train`` so that ``.train()`` and
            ``.eval()`` calls can be chained.
        """
        self.cnn.train(mode)
        # Keep this module's own flag in sync with the requested mode.
        self.training = mode
        return self

    def forward(self, x, upsample=False):
        """Build the feature pyramid for a batch of images.

        Args:
            x: 4-D input tensor, unpacked as ``(B, C, H, W)``.
            upsample: When true, the DINOv2 branch is skipped and the CNN
                pyramid is returned unchanged.

        Returns:
            The dict produced by ``self.cnn``; unless ``upsample`` is set,
            its entry 16 is replaced by the DINOv2 patch-token features
            reshaped to ``(B, 1024, H // 14, W // 14)``.
        """
        B, C, H, W = x.shape
        feature_pyramid = self.cnn(x)

        if not upsample:
            with torch.no_grad():
                # Lazily move the ViT to the input's device. This relies on
                # the ViT object exposing a ``.device`` attribute (defined in
                # ``.transformer``, not visible here).
                if self.dinov2_vitl14[0].device != x.device:
                    self.dinov2_vitl14[0] = (
                        self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
                    )
                # NOTE(review): the input is cast to ``amp_dtype`` regardless
                # of ``self.amp`` -- confirm this is intended when amp is off.
                dinov2_features_16 = self.dinov2_vitl14[0].forward_features(
                    x.to(self.amp_dtype)
                )
                # Tokens -> channels-first map; 14 is the ViT patch size and
                # 1024 is presumably the ViT-L embedding width.
                features_16 = (
                    dinov2_features_16["x_norm_patchtokens"]
                    .permute(0, 2, 1)
                    .reshape(B, 1024, H // 14, W // 14)
                )
                del dinov2_features_16
                feature_pyramid[16] = features_16
        return feature_pyramid