from typing import Optional, Union
import gc

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as tvm

# Module-level autocast device type; falls back to CPU when CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"


class ResNet50(nn.Module):
    def __init__(
        self,
        pretrained=False,
        high_res=False,
        weights=None,
        dilation=None,
        freeze_bn=True,
        anti_aliased=False,
        early_exit=False,
        amp=False,
    ) -> None:
        super().__init__()
        if dilation is None:
            dilation = [False, False, False]
        if anti_aliased:
            # Anti-aliased backbone is not implemented here; self.net is left unset in this branch.
            pass
        else:
            if weights is not None:
                self.net = tvm.resnet50(
                    weights=weights, replace_stride_with_dilation=dilation
                )
            else:
                self.net = tvm.resnet50(
                    pretrained=pretrained, replace_stride_with_dilation=dilation
                )
        self.high_res = high_res
        self.freeze_bn = freeze_bn
        self.early_exit = early_exit
        self.amp = amp
        # Prefer bfloat16 autocast when the GPU supports it, else float16; plain float32 on CPU.
        if torch.cuda.is_available():
            if torch.cuda.is_bf16_supported():
                self.amp_dtype = torch.bfloat16
            else:
                self.amp_dtype = torch.float16
        else:
            self.amp_dtype = torch.float32

    def forward(self, x, **kwargs):
        with torch.autocast(device, enabled=self.amp, dtype=self.amp_dtype):
            net = self.net
            # Feature pyramid keyed by downsampling stride relative to the input.
            feats = {1: x}
            x = net.conv1(x)
            x = net.bn1(x)
            x = net.relu(x)
            feats[2] = x
            x = net.maxpool(x)
            x = net.layer1(x)
            feats[4] = x
            x = net.layer2(x)
            feats[8] = x
            if self.early_exit:
                return feats
            x = net.layer3(x)
            feats[16] = x
            x = net.layer4(x)
            feats[32] = x
            return feats

    def train(self, mode=True):
        super().train(mode)
        if self.freeze_bn:
            # Keep BatchNorm layers in eval mode so their running statistics stay frozen.
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
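

# A minimal usage sketch (comments only, not part of the original module). With a
# randomly initialised backbone, e.g.
#   enc = ResNet50(pretrained=False)
#   feats = enc(torch.randn(1, 3, 224, 224))
# `feats` is a dict keyed by stride, roughly:
#   feats[1]  -> (1, 3,    224, 224)   the input itself
#   feats[2]  -> (1, 64,   112, 112)
#   feats[4]  -> (1, 256,  56,  56)
#   feats[8]  -> (1, 512,  28,  28)
#   feats[16] -> (1, 1024, 14,  14)
#   feats[32] -> (1, 2048, 7,   7)
# (shapes assume the default torchvision ResNet-50 with no dilation; with
# early_exit=True the dict stops at stride 8).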


class VGG19(nn.Module):
    def __init__(self, pretrained=False, amp=False) -> None:
        super().__init__()
        # Truncated VGG19-BN feature stack; the coarsest features are supplied by DINOv2 instead.
        self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
        self.amp = amp
        if torch.cuda.is_available():
            if torch.cuda.is_bf16_supported():
                self.amp_dtype = torch.bfloat16
            else:
                self.amp_dtype = torch.float16
        else:
            self.amp_dtype = torch.float32

    def forward(self, x, **kwargs):
        with torch.autocast(device, enabled=self.amp, dtype=self.amp_dtype):
            feats = {}
            scale = 1
            for layer in self.layers:
                if isinstance(layer, nn.MaxPool2d):
                    feats[scale] = x
                    scale = scale * 2
                x = layer(x)
            return feats


class CNNandDinov2(nn.Module):
    def __init__(self, cnn_kwargs=None, amp=False, use_vgg=False, dinov2_weights=None):
        super().__init__()
        if dinov2_weights is None:
            dinov2_weights = torch.hub.load_state_dict_from_url(
                "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pth",
                map_location="cpu",
            )
        from .transformer import vit_large

        vit_kwargs = dict(
            img_size=518,
            patch_size=14,
            init_values=1.0,
            ffn_layer="mlp",
            block_chunks=0,
        )

        dinov2_vitl14 = vit_large(**vit_kwargs).eval()
        dinov2_vitl14.load_state_dict(dinov2_weights)
        cnn_kwargs = cnn_kwargs if cnn_kwargs is not None else {}
        if not use_vgg:
            self.cnn = ResNet50(**cnn_kwargs)
        else:
            self.cnn = VGG19(**cnn_kwargs)
        self.amp = amp
        if torch.cuda.is_available():
            if torch.cuda.is_bf16_supported():
                self.amp_dtype = torch.bfloat16
            else:
                self.amp_dtype = torch.float16
        else:
            self.amp_dtype = torch.float32
        if self.amp:
            dinov2_vitl14 = dinov2_vitl14.to(self.amp_dtype)
        # Stored in a plain list (not as a submodule) so its parameters are not
        # registered on this module and stay hidden from DDP and the optimizer.
        self.dinov2_vitl14 = [dinov2_vitl14]

    def train(self, mode: bool = True):
        return self.cnn.train(mode)

    def forward(self, x, upsample=False):
        B, C, H, W = x.shape
        feature_pyramid = self.cnn(x)

        if not upsample:
            with torch.no_grad():
                # Lazily move the DINOv2 backbone to the input device (handles CPU-only runs).
                if self.dinov2_vitl14[0].device != x.device:
                    self.dinov2_vitl14[0] = (
                        self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
                    )
                dinov2_features_16 = self.dinov2_vitl14[0].forward_features(
                    x.to(self.amp_dtype)
                )
                # ViT-L/14 patch tokens reshaped into a dense map at 1/14 of the input resolution.
                features_16 = (
                    dinov2_features_16["x_norm_patchtokens"]
                    .permute(0, 2, 1)
                    .reshape(B, 1024, H // 14, W // 14)
                )
                del dinov2_features_16
            feature_pyramid[16] = features_16
        return feature_pyramid
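

# Minimal smoke-test sketch (not part of the original module): it exercises the
# CNN backbones on a dummy batch without downloading any pretrained weights.
# The 224 x 224 input is an assumption chosen so that both the CNN strides and
# the DINOv2 patch size (14) divide the spatial dimensions evenly.
if __name__ == "__main__":
    dummy = torch.randn(1, 3, 224, 224)

    resnet = ResNet50(pretrained=False).eval()
    with torch.no_grad():
        print({k: tuple(v.shape) for k, v in resnet(dummy).items()})

    vgg = VGG19(pretrained=False).eval()
    with torch.no_grad():
        print({k: tuple(v.shape) for k, v in vgg(dummy).items()})

    # CNNandDinov2 also needs the vendored `.transformer.vit_large` and downloads
    # the DINOv2 ViT-L/14 checkpoint, so it is left out of this quick check.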