import torch
import torch.nn as nn
from functools import partial

from timm.models.vision_transformer import Block


# 3D positional encoding, from https://github.com/bmild/nerf.
class Embedder:
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.create_embedding_fn()

    def create_embedding_fn(self):
        embed_fns = []
        d = self.kwargs['input_dims']
        out_dim = 0
        if self.kwargs['include_input']:
            # Pass the raw coordinates through alongside the periodic features.
            embed_fns.append(lambda x: x)
            out_dim += d

        max_freq = self.kwargs['max_freq_log2']
        N_freqs = self.kwargs['num_freqs']

        if self.kwargs['log_sampling']:
            freq_bands = 2. ** torch.linspace(0., max_freq, N_freqs)
        else:
            freq_bands = torch.linspace(2. ** 0., 2. ** max_freq, N_freqs)

        for freq in freq_bands:
            for p_fn in self.kwargs['periodic_fns']:
                # Bind the loop variables via default args so each lambda keeps its own freq.
                embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
                out_dim += d

        self.embed_fns = embed_fns
        self.out_dim = out_dim

    def embed(self, inputs):
        return torch.cat([fn(inputs) for fn in self.embed_fns], -1)


def get_embedder(posenc_res, input_dims=3):
    embed_kwargs = {
        'include_input': True,
        'input_dims': input_dims,
        'max_freq_log2': posenc_res - 1,
        'num_freqs': posenc_res,
        'log_sampling': True,
        'periodic_fns': [torch.sin, torch.cos],
    }

    embedder_obj = Embedder(**embed_kwargs)

    def embed(x, eo=embedder_obj):
        return eo.embed(x)

    return embed, embedder_obj.out_dim
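
# Usage sketch (shapes follow from the construction above): with
# include_input=True, out_dim = input_dims * (1 + 2 * num_freqs), so
# get_embedder(10) encodes xyz coordinates into 3 * (1 + 2 * 10) = 63 features:
#
#   embed_fn, out_dim = get_embedder(10)     # out_dim == 63
#   feats = embed_fn(torch.rand(1024, 3))    # feats.shape == (1024, 63)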

class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Bottleneck_Linear(nn.Module):
    def __init__(self, n_channels):
        super().__init__()
        self.linear1 = nn.Linear(n_channels, n_channels)
        self.norm = nn.LayerNorm(n_channels)
        self.linear2 = nn.Linear(n_channels, n_channels)
        self.gelu = nn.GELU()

    def forward(self, x):
        x = x + self.linear2(self.gelu(self.linear1(self.norm(x))))
        return x

class Bottleneck_Conv(nn.Module):
    def __init__(self, n_channels, kernel_size=1):
        super().__init__()
        self.linear1 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size,
                                 padding=kernel_size // 2, bias=False)
        self.bn1 = nn.BatchNorm2d(n_channels)
        self.linear2 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size,
                                 padding=kernel_size // 2, bias=False)
        self.bn2 = nn.BatchNorm2d(n_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        # Accept either [B, C] feature vectors or [B, C, H, W] feature maps.
        assert len(x.shape) in [2, 4]
        input_dims = len(x.shape)
        if input_dims == 2:
            # Add singleton spatial dims so Conv2d/BatchNorm2d apply.
            x = x.unsqueeze(-1).unsqueeze(-1)

        residual = x
        out = self.linear1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.linear2(out)
        out = self.bn2(out)
        out += residual
        out = self.relu(out)

        if input_dims == 2:
            out = out.squeeze(-1).squeeze(-1)
        return out

class CLIPFusionBlock_Concat(nn.Module):
    """
    Fuse the CLIP and RGB embeddings via concatenation + projection.
    """
    def __init__(self, n_channels=512, n_layers=1, act=True):
        super().__init__()
        proj = [Bottleneck_Linear(2 * n_channels) for _ in range(n_layers)]
        proj.append(nn.Linear(2 * n_channels, n_channels))
        if act:
            proj.append(nn.GELU())
        self.proj = nn.Sequential(*proj)

    def forward(self, sem_latent, clip_latent):
        """
        sem_latent: [B, N, C]
        clip_latent: [B, C]
        """
        # Broadcast the CLIP embedding over the N tokens and concatenate: [B, N, 2C].
        latent_concat = torch.cat([sem_latent, clip_latent.unsqueeze(1).expand_as(sem_latent)], dim=-1)
        # Project back down to [B, N, C].
        latent = self.proj(latent_concat)
        return latent
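
# Usage sketch: the concat variant broadcasts a [B, C] CLIP embedding over the
# N tokens of a [B, N, C] latent and projects the [B, N, 2C] result back to C:
#
#   fuse = CLIPFusionBlock_Concat(n_channels=512)
#   out = fuse(torch.rand(2, 196, 512), torch.rand(2, 512))  # -> [2, 196, 512]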

class CLIPFusionBlock_Attn(nn.Module):
    """
    Fuse the geometric and semantic embeddings via multi-layer MHA blocks.
    """
    def __init__(self, n_channels=512, n_layers=1, act=True):
        super().__init__()
        self.attn_blocks = nn.ModuleList(
            [Block(
                n_channels, 8, 4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1
            ) for _ in range(n_layers)]
        )
        if act:
            self.attn_blocks.append(nn.GELU())

    def forward(self, sem_latent, clip_latent):
        """
        sem_latent: [B, N, C]
        clip_latent: [B, C]
        """
        # [B, 1+N, C], with the CLIP token prepended.
        latent = torch.cat([clip_latent.unsqueeze(1), sem_latent], dim=1)
        for attn_block in self.attn_blocks:
            latent = attn_block(latent)
        # Drop the CLIP token and return the fused per-token latent: [B, N, C].
        return latent[:, 1:, :]
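

# Minimal smoke test (sketch): assumes torch and timm are installed and uses
# illustrative tensor sizes; modules run in their default training mode.
if __name__ == "__main__":
    embed_fn, out_dim = get_embedder(10)
    pts = torch.rand(4, 1024, 3)                      # batch of xyz coordinates
    print(embed_fn(pts).shape, out_dim)               # torch.Size([4, 1024, 63]) 63

    sem_latent = torch.rand(2, 196, 512)              # [B, N, C] per-token latent
    clip_latent = torch.rand(2, 512)                  # [B, C] CLIP embedding
    print(CLIPFusionBlock_Concat()(sem_latent, clip_latent).shape)  # [2, 196, 512]
    print(CLIPFusionBlock_Attn()(sem_latent, clip_latent).shape)    # [2, 196, 512]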