# MonsterForge-medium / deploy.py
import math
from math import log2

import torch
import torch.nn.functional as F
from torch import nn, einsum
from kornia.filters import filter2d
from einops import reduce, rearrange, repeat


def exists(val):
    return val is not None


def is_power_of_two(val):
    return log2(val).is_integer()


def default(val, d):
    return val if exists(val) else d


def get_1d_dct(i, freq, L):
    # Orthonormal 1D DCT-II basis value at position i for the given frequency.
    result = math.cos(math.pi * freq * (i + 0.5) / L) / math.sqrt(L)
    return result * (1 if freq == 0 else math.sqrt(2))


def get_dct_weights(width, channel, fidx_u, fidx_v):
    # Precompute FcaNet-style 2D DCT filters: each contiguous group of
    # channels is assigned one (u, v) frequency pair.
    dct_weights = torch.zeros(1, channel, width, width)
    c_part = channel // len(fidx_u)
    for i, (u_x, v_y) in enumerate(zip(fidx_u, fidx_v)):
        for x in range(width):
            for y in range(width):
                coor_value = get_1d_dct(x, u_x, width) * get_1d_dct(y, v_y, width)
                dct_weights[:, i * c_part : (i + 1) * c_part, x, y] = coor_value
    return dct_weights
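
# Usage sketch (illustrative, not a call site in this file): for a 64-channel
# feature map at 8x8 resolution with 16 frequency pairs,
#     w = get_dct_weights(8, 64, [0] * 8 + list(range(8)), list(range(8)) + [0] * 8)
# yields a (1, 64, 8, 8) buffer that FCANet below multiplies with the feature
# map before summing over the spatial dimensions.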


class Blur(nn.Module):
    def __init__(self):
        super().__init__()
        f = torch.Tensor([1, 2, 1])
        self.register_buffer("f", f)

    def forward(self, x):
        # Separable binomial (1, 2, 1) low-pass filter used for antialiasing.
        f = self.f
        f = f[None, None, :] * f[None, :, None]
        return filter2d(x, f, normalized=True)
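
# The outer product of (1, 2, 1) with itself gives the 3x3 binomial kernel
#     [[1, 2, 1], [2, 4, 2], [1, 2, 1]]
# and normalized=True has filter2d divide by the kernel sum (16).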


class ChanNorm(nn.Module):
    # LayerNorm over the channel dimension of a (b, c, h, w) tensor, with a
    # learnable per-channel scale g and shift b.
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
        self.b = nn.Parameter(torch.zeros(1, dim, 1, 1))

    def forward(self, x):
        var = torch.var(x, dim=1, unbiased=False, keepdim=True)
        mean = torch.mean(x, dim=1, keepdim=True)
        return (x - mean) / (var + self.eps).sqrt() * self.g + self.b


def Conv2dSame(dim_in, dim_out, kernel_size, bias=True):
    # "Same" padding that also handles even kernel sizes by padding one less
    # on the right/bottom.
    pad_left = kernel_size // 2
    pad_right = (pad_left - 1) if (kernel_size % 2) == 0 else pad_left
    return nn.Sequential(
        nn.ZeroPad2d((pad_left, pad_right, pad_left, pad_right)),
        nn.Conv2d(dim_in, dim_out, kernel_size, bias=bias),
    )


class DepthWiseConv2d(nn.Module):
    def __init__(self, dim_in, dim_out, kernel_size, padding=0, stride=1, bias=True):
        super().__init__()
        # Depthwise spatial convolution followed by a 1x1 pointwise projection.
        self.net = nn.Sequential(
            nn.Conv2d(
                dim_in,
                dim_in,
                kernel_size=kernel_size,
                padding=padding,
                groups=dim_in,
                stride=stride,
                bias=bias,
            ),
            nn.Conv2d(dim_in, dim_out, kernel_size=1, bias=bias),
        )

    def forward(self, x):
        return self.net(x)
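
# For reference: relative to a dense Conv2d, the depthwise + pointwise
# factorization cuts weights from k*k*c_in*c_out to roughly
# k*k*c_in + c_in*c_out (e.g. 3*3*512 + 512*1024 vs. 3*3*512*1024).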


class FCANet(nn.Module):
    def __init__(self, *, chan_in, chan_out, reduction=4, width):
        super().__init__()
        # In the FcaNet paper, 16 frequencies appeared to be ideal.
        freq_w, freq_h = [0] * 8, list(range(8))
        dct_weights = get_dct_weights(
            width, chan_in, [*freq_w, *freq_h], [*freq_h, *freq_w]
        )
        self.register_buffer("dct_weights", dct_weights)
        chan_intermediate = max(3, chan_out // reduction)
        self.net = nn.Sequential(
            nn.Conv2d(chan_in, chan_intermediate, 1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(chan_intermediate, chan_out, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        # Weight the feature map by the DCT filters, then sum over the spatial
        # dimensions to get a (b, c, 1, 1) frequency descriptor.
        x = reduce(
            x * self.dct_weights, "b c (h h1) (w w1) -> b c h1 w1", "sum", h1=1, w1=1
        )
        return self.net(x)


class Generator(nn.Module):
    def __init__(
        self,
        *,
        image_size,
        latent_dim=256,
        fmap_max=512,
        fmap_inverse_coef=12,
        transparent=False,
        greyscale=False,
        attn_res_layers=[],
        freq_chan_attn=False,
        syncbatchnorm=False,
        antialias=False,
    ):
        super().__init__()
        resolution = log2(image_size)
        assert is_power_of_two(image_size), "image size must be a power of 2"
        # Set the normalization and blur classes. Use a local name distinct
        # from the module-level Blur: assigning to a local `Blur` would shadow
        # the class and raise UnboundLocalError when antialias=True.
        norm_class = nn.SyncBatchNorm if syncbatchnorm else nn.BatchNorm2d
        blur_class = nn.Identity if not antialias else Blur
if transparent:
init_channel = 4
elif greyscale:
init_channel = 1
else:
init_channel = 3
self.latent_dim = latent_dim
fmap_max = default(fmap_max, latent_dim)
self.initial_conv = nn.Sequential(
nn.ConvTranspose2d(latent_dim, latent_dim * 2, 4),
norm_class(latent_dim * 2),
nn.GLU(dim=1),
)
num_layers = int(resolution) - 2
features = list(
map(lambda n: (n, 2 ** (fmap_inverse_coef - n)), range(2, num_layers + 2))
)
features = list(map(lambda n: (n[0], min(n[1], fmap_max)), features))
features = list(map(lambda n: 3 if n[0] >= 8 else n[1], features))
features = [latent_dim, *features]
in_out_features = list(zip(features[:-1], features[1:]))
self.res_layers = range(2, num_layers + 2)
self.layers = nn.ModuleList([])
self.res_to_feature_map = dict(zip(self.res_layers, in_out_features))
self.sle_map = ((3, 7), (4, 8), (5, 9), (6, 10))
self.sle_map = list(
filter(lambda t: t[0] <= resolution and t[1] <= resolution, self.sle_map)
)
self.sle_map = dict(self.sle_map)
self.num_layers_spatial_res = 1
for res, (chan_in, chan_out) in zip(self.res_layers, in_out_features):
image_width = 2**res
attn = None
if image_width in attn_res_layers:
attn = PreNorm(chan_in, LinearAttention(chan_in))
sle = None
if res in self.sle_map:
residual_layer = self.sle_map[res]
sle_chan_out = self.res_to_feature_map[residual_layer - 1][-1]
if freq_chan_attn:
sle = FCANet(
chan_in=chan_out, chan_out=sle_chan_out, width=2 ** (res + 1)
)
else:
sle = GlobalContext(chan_in=chan_out, chan_out=sle_chan_out)
layer = nn.ModuleList(
[
nn.Sequential(
PixelShuffleUpsample(chan_in),
                        blur_class(),
Conv2dSame(chan_in, chan_out * 2, 4),
Noise(),
norm_class(chan_out * 2),
nn.GLU(dim=1),
),
sle,
attn,
]
)
self.layers.append(layer)
self.out_conv = nn.Conv2d(features[-1], init_channel, 3, padding=1)

    def forward(self, x):
        x = rearrange(x, "b c -> b c () ()")
        x = self.initial_conv(x)
        x = F.normalize(x, dim=1)
        residuals = dict()
        for res, (up, sle, attn) in zip(self.res_layers, self.layers):
            if exists(attn):
                x = attn(x) + x
            x = up(x)
            if exists(sle):
                # Skip-layer excitation: compute per-channel gates destined
                # for a higher-resolution layer.
                out_res = self.sle_map[res]
                residual = sle(x)
                residuals[out_res] = residual
            next_res = res + 1
            if next_res in residuals:
                x = x * residuals[next_res]
        return self.out_conv(x)
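
# Usage sketch (illustrative, not a call site in this file):
#     g = Generator(image_size=256, latent_dim=256)
#     imgs = g(torch.randn(8, 256))  # -> (8, 3, 256, 256)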


class GlobalContext(nn.Module):
    # Global-context gate (cf. GCNet): a softmax attention map pools the
    # feature map into a (b, c, 1, 1) context vector, and a 1x1 bottleneck
    # turns it into per-channel sigmoid weights.
    def __init__(self, *, chan_in, chan_out):
        super().__init__()
        self.to_k = nn.Conv2d(chan_in, 1, 1)
        chan_intermediate = max(3, chan_out // 2)
        self.net = nn.Sequential(
            nn.Conv2d(chan_in, chan_intermediate, 1),
            nn.LeakyReLU(0.1),
            nn.Conv2d(chan_intermediate, chan_out, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        context = self.to_k(x)
        context = context.flatten(2).softmax(dim=-1)
        out = einsum("b i n, b c n -> b c i", context, x.flatten(2))
        out = out.unsqueeze(-1)
        return self.net(out)


class LinearAttention(nn.Module):
    def __init__(self, dim, dim_head=64, heads=8, kernel_size=3):
        super().__init__()
        self.scale = dim_head**-0.5
        self.heads = heads
        self.dim_head = dim_head
        inner_dim = dim_head * heads
        self.kernel_size = kernel_size
        self.nonlin = nn.GELU()
        # Two branches: global linear attention and conv-like full attention
        # over local kernel_size x kernel_size neighborhoods.
        self.to_lin_q = nn.Conv2d(dim, inner_dim, 1, bias=False)
        self.to_lin_kv = DepthWiseConv2d(dim, inner_dim * 2, 3, padding=1, bias=False)
        self.to_q = nn.Conv2d(dim, inner_dim, 1, bias=False)
        self.to_kv = nn.Conv2d(dim, inner_dim * 2, 1, bias=False)
        self.to_out = nn.Conv2d(inner_dim * 2, dim, 1)

    def forward(self, fmap):
h, x, y = self.heads, *fmap.shape[-2:]
# linear attention
lin_q, lin_k, lin_v = (
self.to_lin_q(fmap),
*self.to_lin_kv(fmap).chunk(2, dim=1),
)
lin_q, lin_k, lin_v = map(
lambda t: rearrange(t, "b (h c) x y -> (b h) (x y) c", h=h),
(lin_q, lin_k, lin_v),
)
lin_q = lin_q.softmax(dim=-1)
lin_k = lin_k.softmax(dim=-2)
lin_q = lin_q * self.scale
context = einsum("b n d, b n e -> b d e", lin_k, lin_v)
lin_out = einsum("b n d, b d e -> b n e", lin_q, context)
lin_out = rearrange(lin_out, "(b h) (x y) d -> b (h d) x y", h=h, x=x, y=y)
# conv-like full attention
q, k, v = (self.to_q(fmap), *self.to_kv(fmap).chunk(2, dim=1))
q, k, v = map(
lambda t: rearrange(t, "b (h c) x y -> (b h) c x y", h=h), (q, k, v)
)
k = F.unfold(k, kernel_size=self.kernel_size, padding=self.kernel_size // 2)
v = F.unfold(v, kernel_size=self.kernel_size, padding=self.kernel_size // 2)
k, v = map(
lambda t: rearrange(t, "b (d j) n -> b n j d", d=self.dim_head), (k, v)
)
q = rearrange(q, "b c ... -> b (...) c") * self.scale
sim = einsum("b i d, b i j d -> b i j", q, k)
sim = sim - sim.amax(dim=-1, keepdim=True).detach()
attn = sim.softmax(dim=-1)
full_out = einsum("b i j, b i j d -> b i d", attn, v)
full_out = rearrange(full_out, "(b h) (x y) d -> b (h d) x y", h=h, x=x, y=y)
# add outputs of linear attention + conv like full attention
lin_out = self.nonlin(lin_out)
out = torch.cat((lin_out, full_out), dim=1)
return self.to_out(out)
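
# Shape contract (illustrative): LinearAttention(dim=256) maps a
# (b, 256, h, w) feature map back to (b, 256, h, w); Generator wraps it in
# PreNorm and adds it residually before each upsample step.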


class Noise(nn.Module):
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1))

    def forward(self, x, noise=None):
        # Add per-pixel noise with a learnable strength (StyleGAN-style).
        b, _, h, w, device = *x.shape, x.device
        if not exists(noise):
            noise = torch.randn(b, 1, h, w, device=device)
        return x + self.weight * noise


class PixelShuffleUpsample(nn.Module):
    def __init__(self, dim, dim_out=None):
        super().__init__()
        dim_out = default(dim_out, dim)
        conv = nn.Conv2d(dim, dim_out * 4, 1)
        self.net = nn.Sequential(conv, nn.SiLU(), nn.PixelShuffle(2))
        self.init_conv_(conv)

    def init_conv_(self, conv):
        # ICNR initialization: replicate one kaiming-initialized kernel across
        # each group of 4 output channels so the pixel shuffle starts free of
        # checkerboard artifacts.
        o, i, h, w = conv.weight.shape
        conv_weight = torch.empty(o // 4, i, h, w)
        nn.init.kaiming_uniform_(conv_weight)
        conv_weight = repeat(conv_weight, "o ... -> (o 4) ...")
        conv.weight.data.copy_(conv_weight)
        nn.init.zeros_(conv.bias.data)

    def forward(self, x):
        return self.net(x)


class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = ChanNorm(dim)

    def forward(self, x):
        return self.fn(self.norm(x))
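

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original dump): build a
    # small generator and check that a latent batch maps to RGB images at the
    # requested resolution. The hyperparameters here are illustrative.
    g = Generator(image_size=64, latent_dim=256, attn_res_layers=[32])
    z = torch.randn(2, 256)
    with torch.no_grad():
        imgs = g(z)
    assert imgs.shape == (2, 3, 64, 64)
    print("generator output shape:", tuple(imgs.shape))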