import torch
import torch.nn as nn

from functools import partial
from timm.models.vision_transformer import Block

# 3D positional encoding (Fourier features), adapted from https://github.com/bmild/nerf.
class Embedder:
    """Concatenates (optionally) the raw input with sin/cos of the input
    taken at a range of frequencies."""
    def __init__(self, **kwargs):
        self.kwargs = kwargs
        self.create_embedding_fn()

    def create_embedding_fn(self):
        embed_fns = []
        d = self.kwargs['input_dims']
        out_dim = 0
        if self.kwargs['include_input']:
            embed_fns.append(lambda x: x)
            out_dim += d

        max_freq = self.kwargs['max_freq_log2']
        N_freqs = self.kwargs['num_freqs']

        if self.kwargs['log_sampling']:
            freq_bands = 2. ** torch.linspace(0., max_freq, N_freqs)
        else:
            freq_bands = torch.linspace(2.**0., 2.**max_freq, N_freqs)

        for freq in freq_bands:
            for p_fn in self.kwargs['periodic_fns']:
                # Bind p_fn and freq as default arguments so each lambda
                # captures its own frequency rather than the loop variable.
                embed_fns.append(lambda x, p_fn=p_fn, freq=freq: p_fn(x * freq))
                out_dim += d

        self.embed_fns = embed_fns
        self.out_dim = out_dim

    def embed(self, inputs):
        return torch.cat([fn(inputs) for fn in self.embed_fns], -1)

def get_embedder(posenc_res, input_dims=3):
    """Builds a positional encoding with posenc_res frequency octaves;
    returns (embed_fn, output_dim)."""
    embed_kwargs = {
        'include_input': True,
        'input_dims': input_dims,
        'max_freq_log2': posenc_res-1,
        'num_freqs': posenc_res,
        'log_sampling': True,
        'periodic_fns': [torch.sin, torch.cos],
    }

    embedder_obj = Embedder(**embed_kwargs)
    def embed(x, eo=embedder_obj): return eo.embed(x)
    return embed, embedder_obj.out_dim
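
# Illustrative usage (a sketch, not part of the original module): with
# include_input=True, out_dim = input_dims * (1 + 2 * num_freqs), e.g.
# posenc_res=10 and input_dims=3 give 3 + 3*2*10 = 63 features per point:
#   embed_fn, out_dim = get_embedder(10)      # out_dim == 63
#   feats = embed_fn(torch.rand(1024, 3))     # feats.shape == [1024, 63]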

class LayerScale(nn.Module):
    """Learnable per-channel scaling of residual branches (as in timm's LayerScale)."""
    def __init__(self, dim, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x.mul_(self.gamma) if self.inplace else x * self.gamma

class Bottleneck_Linear(nn.Module):
    """Pre-norm residual MLP block: x + Linear(GELU(Linear(LayerNorm(x))))."""
    def __init__(self, n_channels):
        super().__init__()
        self.linear1 = nn.Linear(n_channels, n_channels)
        self.norm = nn.LayerNorm(n_channels)
        self.linear2 = nn.Linear(n_channels, n_channels)
        self.gelu = nn.GELU()
        
    def forward(self, x):
        x = x + self.linear2(self.gelu(self.linear1(self.norm(x))))
        return x

class Bottleneck_Conv(nn.Module):
    """Residual conv-BN-ReLU block. Accepts [B, C] vectors (treated as 1x1
    feature maps) or [B, C, H, W] feature maps; output shape matches input."""
    def __init__(self, n_channels, kernel_size=1):
        super().__init__()
        self.linear1 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size, padding=kernel_size//2, bias=False)
        self.bn1 = nn.BatchNorm2d(n_channels)
        self.linear2 = nn.Conv2d(n_channels, n_channels, kernel_size=kernel_size, padding=kernel_size//2, bias=False)
        self.bn2 = nn.BatchNorm2d(n_channels)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self, x):
        assert len(x.shape) in [2, 4]
        input_dims = len(x.shape)
        if input_dims == 2:
            x = x.unsqueeze(-1).unsqueeze(-1)
        residual = x
        out = self.linear1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.linear2(out)
        out = self.bn2(out)
        out += residual
        out = self.relu(out)
        if input_dims == 2:
            out = out.squeeze(-1).squeeze(-1)
        return out
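
# Illustrative shapes (a sketch, not part of the original module):
#   block = Bottleneck_Conv(256)
#   block(torch.randn(8, 256)).shape         # -> [8, 256]
#   block(torch.randn(8, 256, 7, 7)).shape   # -> [8, 256, 7, 7]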

class CLIPFusionBlock_Concat(nn.Module):
    """
    Fuse CLIP and RGB embeddings by concatenating them channel-wise and
    projecting back to n_channels with a small MLP.
    """
    def __init__(self, n_channels=512, n_layers=1, act=True):
        super().__init__()
        proj = [Bottleneck_Linear(2 * n_channels) for _ in range(n_layers)]
        proj.append(nn.Linear(2 * n_channels, n_channels))
        if act: proj.append(nn.GELU())
        self.proj = nn.Sequential(*proj)
    
    def forward(self, sem_latent, clip_latent):
        """ 
        sem_latent: [B, N, C]
        clip_latent: [B, C]
        """
        # [B, N, 2C]
        latent_concat = torch.cat([sem_latent, clip_latent.unsqueeze(1).expand_as(sem_latent)], dim=-1)
        # [B, N, C]
        latent = self.proj(latent_concat)
        return latent

class CLIPFusionBlock_Attn(nn.Module):
    """
    Fuse CLIP and semantic embeddings via multi-layer MHA (transformer) blocks
    """
    def __init__(self, n_channels=512, n_layers=1, act=True):
        super().__init__()
        self.attn_blocks = nn.ModuleList(
            [Block(
                # timm ViT Block: dim, num_heads, mlp_ratio
                n_channels, 8, 4, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), drop_path=0.1
            ) for _ in range(n_layers)]
        )
        # Optional GELU, applied after the attention blocks in forward().
        if act: self.attn_blocks.append(nn.GELU())
    
    def forward(self, sem_latent, clip_latent):
        """ 
        sem_latent: [B, N, C]
        clip_latent: [B, C]
        """
        # [B, 1+N, C], clip first
        latent = torch.cat([clip_latent.unsqueeze(1), sem_latent], dim=1)
        for attn_block in self.attn_blocks:
            latent = attn_block(latent)
        # [B, N, C]
        return latent[:, 1:, :]
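
if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): both fusion blocks consume per-token semantic features
    # [B, N, C] plus one CLIP embedding [B, C] and return [B, N, C].
    B, N, C = 2, 16, 512
    sem = torch.randn(B, N, C)
    clip_emb = torch.randn(B, C)

    concat_fuse = CLIPFusionBlock_Concat(n_channels=C, n_layers=1)
    attn_fuse = CLIPFusionBlock_Attn(n_channels=C, n_layers=1)

    assert concat_fuse(sem, clip_emb).shape == (B, N, C)
    assert attn_fuse(sem, clip_emb).shape == (B, N, C)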