Spaces:
Running
on
L40S
Running
on
L40S
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# This work is licensed under a Creative Commons
# Attribution-NonCommercial-ShareAlike 4.0 International License.
# You should have received a copy of the license along with this
# work. If not, see http://creativecommons.org/licenses/by-nc-sa/4.0/
"""Improved diffusion model architecture proposed in the paper
"Analyzing and Improving the Training Dynamics of Diffusion Models"."""
import numpy as np | |
import torch | |
#----------------------------------------------------------------------------
# Cached constant tensors. constant() builds and memoizes a tensor;
# const_like() is a variant that inherits dtype and device from the given
# reference tensor by default.
# Module-level memo of constant tensors, keyed by the value's bytes together
# with the requested shape, dtype, device and memory format.
_constant_cache = {}


def constant(value, shape=None, dtype=None, device=None, memory_format=None):
    """Return a cached tensor holding ``value``.

    Args:
        value: Anything ``np.asarray`` accepts (scalar, list, ndarray).
        shape: Optional shape to broadcast the value against.
        dtype: Target dtype; defaults to ``torch.get_default_dtype()``.
        device: Target device; defaults to the CPU.
        memory_format: Target memory format; defaults to
            ``torch.contiguous_format``.

    Returns:
        A tensor with the requested properties. The same tensor object is
        returned for repeated calls with identical arguments, so callers
        share it and should not modify it in place.
    """
    array = np.asarray(value)
    target_shape = None if shape is None else tuple(shape)
    target_dtype = torch.get_default_dtype() if dtype is None else dtype
    target_device = torch.device('cpu') if device is None else device
    layout = torch.contiguous_format if memory_format is None else memory_format

    cache_key = (array.shape, array.dtype, array.tobytes(),
                 target_shape, target_dtype, target_device, layout)
    cached = _constant_cache.get(cache_key, None)
    if cached is not None:
        return cached

    # Cache miss: build the tensor from a private copy of the numpy data.
    result = torch.as_tensor(array.copy(), dtype=target_dtype, device=target_device)
    if target_shape is not None:
        # The empty tensor only supplies the shape to broadcast against.
        result, _ = torch.broadcast_tensors(result, torch.empty(target_shape))
    result = result.contiguous(memory_format=layout)
    _constant_cache[cache_key] = result
    return result
def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
    """Build a constant via constant(), taking dtype/device from ``ref``.

    Args:
        ref: Object whose ``dtype`` and ``device`` attributes supply the
            defaults (typically a tensor).
        value: Value forwarded to constant().
        shape: Optional shape forwarded to constant().
        dtype: Explicit dtype; overrides ``ref.dtype`` when given.
        device: Explicit device; overrides ``ref.device`` when given.
        memory_format: Forwarded to constant() unchanged.

    Returns:
        Whatever constant() returns for the resolved arguments.
    """
    resolved_dtype = ref.dtype if dtype is None else dtype
    resolved_device = ref.device if device is None else device
    return constant(
        value,
        shape=shape,
        dtype=resolved_dtype,
        device=resolved_device,
        memory_format=memory_format,
    )
#----------------------------------------------------------------------------
# Normalize given tensor to unit magnitude with respect to the given
# dimensions. Default = all dimensions except the first.
def normalize(x, dim=None, eps=1e-4):
    """Scale ``x`` to unit magnitude over the given dimensions.

    Args:
        x: Input tensor.
        dim: Dimensions to reduce over. ``None`` means every dimension
            except the first.
        eps: Constant added to the denominator; keeps the division finite
            for all-zero inputs.

    Returns:
        ``x`` divided by ``eps + norm * sqrt(norm.numel() / x.numel())``,
        in the dtype of ``x``. The norm itself is computed in float32.
    """
    reduce_dims = list(range(1, x.ndim)) if dim is None else dim
    magnitude = torch.linalg.vector_norm(
        x, dim=reduce_dims, keepdim=True, dtype=torch.float32)
    # numel ratio = 1 / (elements reduced per norm), so the scaled L2 norm
    # is the root-mean-square of the reduced elements.
    rms_factor = np.sqrt(magnitude.numel() / x.numel())
    denominator = torch.add(eps, magnitude, alpha=rms_factor)
    return x / denominator.to(x.dtype)
class Normalize(torch.nn.Module):
    """Module wrapper that applies normalize() with stored ``dim`` and ``eps``."""

    def __init__(self, dim=None, eps=1e-4):
        """Store the reduction dimensions and epsilon passed to normalize()."""
        super().__init__()
        self.dim, self.eps = dim, eps

    def forward(self, x):
        """Return ``normalize(x)`` using this module's ``dim`` and ``eps``."""
        settings = dict(dim=self.dim, eps=self.eps)
        return normalize(x, **settings)
#----------------------------------------------------------------------------
# Upsample or downsample the given tensor with the given filter,
# or keep it as is.
def resample(x, f=(1, 1), mode='keep'):
    """Upsample or downsample ``x`` by a factor of 2, or return it unchanged.

    Args:
        x: Input tensor. Its second dimension is used as the channel count
            and it is passed to 2-D convolutions, so it is assumed to be
            laid out as (N, C, H, W).
        f: 1-D filter taps with an even length. The taps are normalized to
            sum to 1 and expanded to a separable 2-D kernel via an outer
            product. The default is an immutable tuple so that it cannot be
            shared-and-mutated between calls.
        mode: ``'keep'`` returns ``x`` as is, ``'down'`` applies a stride-2
            depthwise convolution, ``'up'`` applies a stride-2 depthwise
            transposed convolution.

    Returns:
        ``x`` itself for ``'keep'``, otherwise the resampled tensor.

    Raises:
        AssertionError: If ``mode`` is not one of the three supported values
            or ``f`` is not a 1-D filter with an even number of taps.
    """
    # Validate the mode first so that a typo fails before any filter is
    # built (and cached) for it.
    assert mode in ('keep', 'down', 'up'), f'unknown resample mode: {mode!r}'
    if mode == 'keep':
        return x

    taps = np.float32(f)
    assert taps.ndim == 1 and len(taps) % 2 == 0, \
        'f must be a 1-D filter with an even number of taps'
    pad = (len(taps) - 1) // 2
    taps = taps / taps.sum()
    kernel = np.outer(taps, taps)[np.newaxis, np.newaxis, :, :]
    kernel = const_like(x, kernel)  # match dtype and device of x
    channels = x.shape[1]

    if mode == 'down':
        # One copy of the kernel per channel; groups=channels makes the
        # convolution depthwise.
        return torch.nn.functional.conv2d(
            x,
            kernel.tile([channels, 1, 1, 1]),
            groups=channels,
            stride=2,
            padding=(pad,),
        )

    # mode == 'up': the kernel is scaled by 4 (= stride**2), presumably to
    # keep the average signal level unchanged after 2x2 upsampling.
    return torch.nn.functional.conv_transpose2d(
        x,
        (kernel * 4).tile([channels, 1, 1, 1]),
        groups=channels,
        stride=2,
        padding=(pad,),
    )
#----------------------------------------------------------------------------
# Magnitude-preserving SiLU (Equation 81).
def mp_silu(x):
    """Apply SiLU to ``x`` and divide the result by the constant 0.596.

    The section comment refers to this as the magnitude-preserving SiLU
    (Equation 81); the divisor is the fixed rescaling constant used there.
    """
    activated = torch.nn.functional.silu(x)
    return activated / 0.596
class MPSiLU(torch.nn.Module):
    """Module form of mp_silu(); defines no parameters or state of its own."""

    def forward(self, x):
        """Return ``mp_silu(x)``."""
        activated = mp_silu(x)
        return activated
#----------------------------------------------------------------------------
# Magnitude-preserving sum (Equation 88).
def mp_sum(a, b, t=0.5):
    """Blend ``a`` and ``b`` and rescale by the blend weights' L2 norm.

    Args:
        a: First tensor (weight ``1 - t``).
        b: Second tensor (weight ``t``).
        t: Interpolation weight; 0 selects ``a``, 1 selects ``b``.

    Returns:
        ``a.lerp(b, t)`` divided by ``sqrt((1 - t)**2 + t**2)``.
    """
    blended = a.lerp(b, t)
    weight_norm = np.sqrt((1 - t)**2 + t**2)
    return blended / weight_norm
#----------------------------------------------------------------------------
# Magnitude-preserving concatenation (Equation 103).
def mp_cat(a, b, dim=1, t=0.5):
    """Concatenate ``a`` and ``b`` along ``dim`` after rescaling each one.

    Args:
        a: First tensor.
        b: Second tensor.
        dim: Dimension to concatenate along; its sizes in ``a`` and ``b``
            determine the per-input scale factors.
        t: Blend weight; ``a`` is weighted by ``1 - t`` and ``b`` by ``t``.

    Returns:
        ``torch.cat([wa * a, wb * b], dim)`` where, with ``Na``/``Nb`` the
        sizes along ``dim`` and ``C = sqrt((Na + Nb) / ((1 - t)**2 + t**2))``,
        ``wa = C / sqrt(Na) * (1 - t)`` and ``wb = C / sqrt(Nb) * t``.
    """
    size_a, size_b = a.shape[dim], b.shape[dim]
    overall = np.sqrt((size_a + size_b) / ((1 - t)**2 + t**2))
    scale_a = overall / np.sqrt(size_a) * (1 - t)
    scale_b = overall / np.sqrt(size_b) * t
    pieces = [scale_a * a, scale_b * b]
    return torch.cat(pieces, dim=dim)
#----------------------------------------------------------------------------
# Magnitude-preserving convolution or fully-connected layer (Equation 47)
# with forced weight normalization (Equation 66).
class MPConv1D(torch.nn.Module):
    """Bias-free 1-D convolution whose weight normalization is baked in once.

    The weight is created as a random (out_channels, in_channels,
    kernel_size) parameter. forward() refuses to run until
    remove_weight_norm() has normalized the stored weight in place.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        """Create the randomly initialized weight; normalization is pending."""
        super().__init__()
        self.out_channels = out_channels
        initial_weight = torch.randn(out_channels, in_channels, kernel_size)
        self.weight = torch.nn.Parameter(initial_weight)
        # Flipped to True by remove_weight_norm(); checked in forward().
        self.weight_norm_removed = False

    def forward(self, x, gain=1):
        """Apply the layer to ``x`` with the weight multiplied by ``gain``.

        A 2-D weight is applied as a matrix product (fully-connected layer);
        a 3-D weight is applied as a 1-D convolution padded by half the
        kernel width, which preserves the length for odd kernel sizes.

        Raises:
            AssertionError: If remove_weight_norm() has not been called, or
                if the weight is neither 2-D nor 3-D.
        """
        assert self.weight_norm_removed, 'call remove_weight_norm() before inference'
        kernel = self.weight * gain
        if kernel.ndim != 2:
            assert kernel.ndim == 3
            half_width = kernel.shape[-1] // 2
            return torch.nn.functional.conv1d(x, kernel, padding=(half_width,))
        return x @ kernel.t()

    def remove_weight_norm(self):
        """Normalize the stored weight in place and enable forward().

        The weight is normalized with normalize() in float32, divided by
        the square root of the number of elements per output channel, cast
        back to the parameter's dtype and copied into the parameter.

        Returns:
            ``self``, so the call can be chained.
        """
        baked = normalize(self.weight.to(torch.float32))  # traditional weight normalization
        fan_in = baked[0].numel()
        baked = (baked / np.sqrt(fan_in)).to(self.weight.dtype)
        self.weight.data.copy_(baked)
        self.weight_norm_removed = True
        return self