def set_attr(obj, attr, value):
    """Replace the attribute at dotted path ``attr`` with a frozen parameter.

    ``attr`` is split on ``"."``; every component but the last is resolved
    with ``getattr`` starting from ``obj``.  The last component is then
    rebound to ``torch.nn.Parameter(value, requires_grad=False)``.

    Raises:
        AttributeError: If any component of the path, including the final
            one, does not already exist.
    """
    *parent_names, leaf = attr.split(".")
    owner = obj
    for part in parent_names:
        owner = getattr(owner, part)
    # Look the leaf up first so a missing attribute raises instead of being
    # silently created by setattr.
    getattr(owner, leaf)
    setattr(owner, leaf, torch.nn.Parameter(value, requires_grad=False))


def copy_to_param(obj, attr, value):
    """Copy ``value`` in place into the tensor found at dotted path ``attr``.

    Unlike ``set_attr`` the existing object is kept; only its ``.data`` is
    overwritten through ``copy_``.
    """
    *parent_names, leaf = attr.split(".")
    owner = obj
    for part in parent_names:
        owner = getattr(owner, part)
    existing = getattr(owner, leaf)
    existing.data.copy_(value)


def get_attr(obj, attr):
    """Return the object reached by following dotted path ``attr`` from ``obj``."""
    current = obj
    for part in attr.split("."):
        current = getattr(current, part)
    return current
+ res = (torch.sin((1.0-r.squeeze(1))*omega)/so).unsqueeze(1)*b1_normalized + (torch.sin(r.squeeze(1)*omega)/so).unsqueeze(1) * b2_normalized + res *= (b1_norms * (1.0-r) + b2_norms * r).expand(-1,c) + + #edge cases for same or polar opposites + res[dot > 1 - 1e-5] = b1[dot > 1 - 1e-5] + res[dot < 1e-5 - 1] = (b1 * (1.0-r) + b2 * r)[dot < 1e-5 - 1] + return res + + def generate_bilinear_data(length_old, length_new, device): + coords_1 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1)) + coords_1 = torch.nn.functional.interpolate(coords_1, size=(1, length_new), mode="bilinear") + ratios = coords_1 - coords_1.floor() + coords_1 = coords_1.to(torch.int64) + + coords_2 = torch.arange(length_old, dtype=torch.float32, device=device).reshape((1,1,1,-1)) + 1 + coords_2[:,:,:,-1] -= 1 + coords_2 = torch.nn.functional.interpolate(coords_2, size=(1, length_new), mode="bilinear") + coords_2 = coords_2.to(torch.int64) + return ratios, coords_1, coords_2 + + orig_dtype = samples.dtype + samples = samples.float() + n,c,h,w = samples.shape + h_new, w_new = (height, width) + + #linear w + ratios, coords_1, coords_2 = generate_bilinear_data(w, w_new, samples.device) + coords_1 = coords_1.expand((n, c, h, -1)) + coords_2 = coords_2.expand((n, c, h, -1)) + ratios = ratios.expand((n, 1, h, -1)) + + pass_1 = samples.gather(-1,coords_1).movedim(1, -1).reshape((-1,c)) + pass_2 = samples.gather(-1,coords_2).movedim(1, -1).reshape((-1,c)) + ratios = ratios.movedim(1, -1).reshape((-1,1)) + + result = slerp(pass_1, pass_2, ratios) + result = result.reshape(n, h, w_new, c).movedim(-1, 1) + + #linear h + ratios, coords_1, coords_2 = generate_bilinear_data(h, h_new, samples.device) + coords_1 = coords_1.reshape((1,1,-1,1)).expand((n, c, -1, w_new)) + coords_2 = coords_2.reshape((1,1,-1,1)).expand((n, c, -1, w_new)) + ratios = ratios.reshape((1,1,-1,1)).expand((n, 1, -1, w_new)) + + pass_1 = result.gather(-2,coords_1).movedim(1, -1).reshape((-1,c)) + pass_2 = 
def common_upscale(samples, width, height, upscale_method, crop):
    """Resize ``samples`` to ``(height, width)``, optionally center-cropping first.

    When ``crop == "center"`` the input is trimmed symmetrically along
    whichever axis is too long for the target aspect ratio; any other value
    of ``crop`` leaves the input untouched.  ``upscale_method`` selects
    ``bislerp``, ``lanczos``, or is otherwise forwarded as ``mode`` to
    ``torch.nn.functional.interpolate``.

    The indexing used here (``shape[2]`` as height, ``shape[3]`` as width)
    implies a 4-D input whose last two axes are spatial.
    """
    cropped = samples
    if crop == "center":
        src_h, src_w = samples.shape[2], samples.shape[3]
        src_ratio = src_w / src_h
        dst_ratio = width / height

        # Pixels removed from each side along x / y.
        trim_x = trim_y = 0
        if src_ratio > dst_ratio:
            trim_x = round((src_w - src_w * (dst_ratio / src_ratio)) / 2)
        elif src_ratio < dst_ratio:
            trim_y = round((src_h - src_h * (src_ratio / dst_ratio)) / 2)
        cropped = samples[:, :, trim_y:src_h - trim_y, trim_x:src_w - trim_x]

    if upscale_method == "bislerp":
        return bislerp(cropped, width, height)
    if upscale_method == "lanczos":
        return lanczos(cropped, width, height)
    return torch.nn.functional.interpolate(
        cropped, size=(height, width), mode=upscale_method
    )
# Module-wide switch toggled through set_progress_bar_enabled().
PROGRESS_BAR_ENABLED = True


def set_progress_bar_enabled(enabled):
    """Store ``enabled`` in the module-level ``PROGRESS_BAR_ENABLED`` flag."""
    global PROGRESS_BAR_ENABLED
    PROGRESS_BAR_ENABLED = enabled


# Callback captured by every ProgressBar created after it is installed.
PROGRESS_BAR_HOOK = None


def set_progress_bar_global_hook(function):
    """Install ``function`` as the hook picked up by new ``ProgressBar`` objects."""
    global PROGRESS_BAR_HOOK
    PROGRESS_BAR_HOOK = function


class ProgressBar:
    """Progress counter that reports to the hook active at construction time.

    The hook, when not ``None``, is called as ``hook(current, total, preview)``
    after every update.
    """

    def __init__(self, total):
        self.total = total
        self.current = 0
        # Snapshot of the global hook; later changes to the global do not
        # affect an already-created bar.
        self.hook = PROGRESS_BAR_HOOK

    def update_absolute(self, value, total=None, preview=None):
        """Set progress to ``value`` (capped at the total) and notify the hook.

        A non-``None`` ``total`` replaces the stored total before capping.
        """
        if total is not None:
            self.total = total
        self.current = self.total if value > self.total else value
        if self.hook is None:
            return
        self.hook(self.current, self.total, preview)

    def update(self, value):
        """Advance progress by ``value`` relative to the current position."""
        self.update_absolute(self.current + value)
def img2windows(img, H_sp, W_sp):
    """Partition an image batch into non-overlapping windows.

    Input: Image (B, C, H, W)
    Output: Window Partition (B', N, C) with N = H_sp * W_sp and
        B' = B * (H // H_sp) * (W // W_sp)
    """
    B, C, H, W = img.shape
    # Split each spatial axis into (number of windows, window extent).
    tiled = img.view(B, C, H // H_sp, H_sp, W // W_sp, W_sp)
    # Bring the two window-count axes forward and channels last, then fold
    # every window into a row of N tokens.
    windows_first = tiled.permute(0, 2, 4, 3, 5, 1).contiguous()
    return windows_first.reshape(-1, H_sp * W_sp, C)


def windows2img(img_splits_hw, H_sp, W_sp, H, W):
    """Reassemble windows produced by ``img2windows`` into a channels-last image.

    Input: Window Partition (B', N, C)
    Output: Image (B, H, W, C)
    """
    windows_per_image = H * W / H_sp / W_sp
    B = int(img_splits_hw.shape[0] / windows_per_image)

    grid = img_splits_hw.view(B, H // H_sp, W // W_sp, H_sp, W_sp, -1)
    # Interleave window-count and in-window axes back into full H and W.
    return grid.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
Default: 0.0 + """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.sg = SpatialGate(hidden_features // 2) + self.fc2 = nn.Linear(hidden_features // 2, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, H, W): + """ + Input: x: (B, H*W, C), H, W + Output: x: (B, H*W, C) + """ + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + + x = self.sg(x, H, W) + x = self.drop(x) + + x = self.fc2(x) + x = self.drop(x) + return x + + +class DynamicPosBias(nn.Module): + # The implementation builds on Crossformer code https://github.com/cheerss/CrossFormer/blob/main/models/crossformer.py + """Dynamic Relative Position Bias. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + residual (bool): If True, use residual strage to connect conv. 
class Spatial_Attention(nn.Module):
    """Spatial window self-attention.

    Supports rectangular windows (square windows being a special case).

    Args:
        dim (int): Number of input channels.
        idx (int): Window orientation. ``0`` uses ``split_size`` as
            ``(H_sp, W_sp)``; ``1`` swaps the two entries.
        split_size (sequence of 2 ints): Height and width of the spatial window.
        dim_out (int | None): Dimension of the attention output; ``dim`` is
            stored when this is None. Default: None
        num_heads (int): Number of attention heads. Default: 6
        attn_drop (float): Dropout ratio of the attention weights. Default: 0.0
        proj_drop (float): Accepted for signature compatibility; this module
            does not use it. Default: 0.0
        qk_scale (float | None): Overrides the default qk scale of
            ``head_dim ** -0.5`` when set.
        position_bias (bool): Whether to add the dynamic relative position
            bias. Default: True

    Raises:
        ValueError: If ``idx`` is neither 0 nor 1.
    """

    def __init__(
        self,
        dim,
        idx,
        split_size=(8, 8),
        dim_out=None,
        num_heads=6,
        attn_drop=0.0,
        proj_drop=0.0,
        qk_scale=None,
        position_bias=True,
    ):
        super().__init__()
        self.dim = dim
        self.dim_out = dim_out or dim
        self.split_size = split_size
        self.num_heads = num_heads
        self.idx = idx
        self.position_bias = position_bias

        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        if idx == 0:
            H_sp, W_sp = self.split_size[0], self.split_size[1]
        elif idx == 1:
            W_sp, H_sp = self.split_size[0], self.split_size[1]
        else:
            # Library code must not terminate the interpreter; report the bad
            # argument to the caller instead.
            raise ValueError(f"idx must be 0 or 1, got {idx!r}")
        self.H_sp = H_sp
        self.W_sp = W_sp

        if self.position_bias:
            self.pos = DynamicPosBias(self.dim // 4, self.num_heads, residual=False)
            # generate mother-set: every relative offset reachable in a window
            position_bias_h = torch.arange(1 - self.H_sp, self.H_sp)
            position_bias_w = torch.arange(1 - self.W_sp, self.W_sp)
            biases = torch.stack(torch.meshgrid([position_bias_h, position_bias_w]))
            biases = biases.flatten(1).transpose(0, 1).contiguous().float()
            self.register_buffer("rpe_biases", biases)

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(self.H_sp)
            coords_w = torch.arange(self.W_sp)
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))
            coords_flatten = torch.flatten(coords, 1)
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()
            # Shift offsets to start at zero, then fold (dh, dw) into a single
            # row-major index into the flattened bias table.
            relative_coords[:, :, 0] += self.H_sp - 1
            relative_coords[:, :, 1] += self.W_sp - 1
            relative_coords[:, :, 0] *= 2 * self.W_sp - 1
            relative_position_index = relative_coords.sum(-1)
            self.register_buffer("relative_position_index", relative_position_index)

        self.attn_drop = nn.Dropout(attn_drop)

    def im2win(self, x, H, W):
        """Split tokens ``x`` of shape (B, H*W, C) into per-head window tokens.

        Returns a tensor of shape (B', num_heads, H_sp * W_sp, C // num_heads).
        """
        B, N, C = x.shape
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)
        x = img2windows(x, self.H_sp, self.W_sp)
        x = (
            x.reshape(-1, self.H_sp * self.W_sp, self.num_heads, C // self.num_heads)
            .permute(0, 2, 1, 3)
            .contiguous()
        )
        return x

    def forward(self, qkv, H, W, mask=None):
        """Run windowed attention.

        Input: qkv: indexable so that qkv[0], qkv[1], qkv[2] are q, k, v,
            each of shape (B, L, C) with L == H * W;
            mask: (nW, N, N) or None, N being the number of tokens per window
        Output: x (B, H, W, C)
        """
        q, k, v = qkv[0], qkv[1], qkv[2]

        B, L, C = q.shape
        assert L == H * W, "flatten img_tokens has wrong size"

        # partition the q,k,v, image to window
        q = self.im2win(q, H, W)
        k = self.im2win(k, H, W)
        v = self.im2win(v, H, W)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)  # B head N C @ B head C N --> B head N N

        # calculate drpe
        if self.position_bias:
            pos = self.pos(self.rpe_biases)
            # select position bias
            relative_position_bias = pos[self.relative_position_index.view(-1)].view(
                self.H_sp * self.W_sp, self.H_sp * self.W_sp, -1
            )
            relative_position_bias = relative_position_bias.permute(
                2, 0, 1
            ).contiguous()
            attn = attn + relative_position_bias.unsqueeze(0)

        N = attn.shape[3]

        # use mask for shift window
        if mask is not None:
            nW = mask.shape[0]
            # Expose the per-image window axis so the (nW, N, N) mask
            # broadcasts over batch and heads.
            attn = attn.view(B, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(
                0
            )
            attn = attn.view(-1, self.num_heads, N, N)

        attn = nn.functional.softmax(attn, dim=-1, dtype=attn.dtype)
        attn = self.attn_drop(attn)

        x = attn @ v
        x = x.transpose(1, 2).reshape(
            -1, self.H_sp * self.W_sp, C
        )  # B head N N @ B head N C

        # merge the window, window to image
        x = windows2img(x, self.H_sp, self.W_sp, H, W)  # B H' W' C

        return x
Default: True + qk_scale (float | None): Override default qk scale of head_dim ** -0.5 if set. + drop (float): Dropout rate. Default: 0.0 + attn_drop (float): Attention dropout rate. Default: 0.0 + rg_idx (int): The indentix of Residual Group (RG) + b_idx (int): The indentix of Block in each RG + """ + + def __init__( + self, + dim, + num_heads, + reso=64, + split_size=[8, 8], + shift_size=[1, 2], + qkv_bias=False, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + rg_idx=0, + b_idx=0, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.split_size = split_size + self.shift_size = shift_size + self.b_idx = b_idx + self.rg_idx = rg_idx + self.patches_resolution = reso + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + + assert ( + 0 <= self.shift_size[0] < self.split_size[0] + ), "shift_size must in 0-split_size0" + assert ( + 0 <= self.shift_size[1] < self.split_size[1] + ), "shift_size must in 0-split_size1" + + self.branch_num = 2 + + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(drop) + + self.attns = nn.ModuleList( + [ + Spatial_Attention( + dim // 2, + idx=i, + split_size=split_size, + num_heads=num_heads // 2, + dim_out=dim // 2, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + position_bias=True, + ) + for i in range(self.branch_num) + ] + ) + + if (self.rg_idx % 2 == 0 and self.b_idx > 0 and (self.b_idx - 2) % 4 == 0) or ( + self.rg_idx % 2 != 0 and self.b_idx % 4 == 0 + ): + attn_mask = self.calculate_mask( + self.patches_resolution, self.patches_resolution + ) + self.register_buffer("attn_mask_0", attn_mask[0]) + self.register_buffer("attn_mask_1", attn_mask[1]) + else: + attn_mask = None + self.register_buffer("attn_mask_0", None) + self.register_buffer("attn_mask_1", None) + + self.dwconv = nn.Sequential( + nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim), + nn.BatchNorm2d(dim), + nn.GELU(), + ) + self.channel_interaction = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + 
nn.Conv2d(dim, dim // 8, kernel_size=1), + nn.BatchNorm2d(dim // 8), + nn.GELU(), + nn.Conv2d(dim // 8, dim, kernel_size=1), + ) + self.spatial_interaction = nn.Sequential( + nn.Conv2d(dim, dim // 16, kernel_size=1), + nn.BatchNorm2d(dim // 16), + nn.GELU(), + nn.Conv2d(dim // 16, 1, kernel_size=1), + ) + + def calculate_mask(self, H, W): + # The implementation builds on Swin Transformer code https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py + # calculate attention mask for shift window + img_mask_0 = torch.zeros((1, H, W, 1)) # 1 H W 1 idx=0 + img_mask_1 = torch.zeros((1, H, W, 1)) # 1 H W 1 idx=1 + h_slices_0 = ( + slice(0, -self.split_size[0]), + slice(-self.split_size[0], -self.shift_size[0]), + slice(-self.shift_size[0], None), + ) + w_slices_0 = ( + slice(0, -self.split_size[1]), + slice(-self.split_size[1], -self.shift_size[1]), + slice(-self.shift_size[1], None), + ) + + h_slices_1 = ( + slice(0, -self.split_size[1]), + slice(-self.split_size[1], -self.shift_size[1]), + slice(-self.shift_size[1], None), + ) + w_slices_1 = ( + slice(0, -self.split_size[0]), + slice(-self.split_size[0], -self.shift_size[0]), + slice(-self.shift_size[0], None), + ) + cnt = 0 + for h in h_slices_0: + for w in w_slices_0: + img_mask_0[:, h, w, :] = cnt + cnt += 1 + cnt = 0 + for h in h_slices_1: + for w in w_slices_1: + img_mask_1[:, h, w, :] = cnt + cnt += 1 + + # calculate mask for window-0 + img_mask_0 = img_mask_0.view( + 1, + H // self.split_size[0], + self.split_size[0], + W // self.split_size[1], + self.split_size[1], + 1, + ) + img_mask_0 = ( + img_mask_0.permute(0, 1, 3, 2, 4, 5) + .contiguous() + .view(-1, self.split_size[0], self.split_size[1], 1) + ) # nW, sw[0], sw[1], 1 + mask_windows_0 = img_mask_0.view(-1, self.split_size[0] * self.split_size[1]) + attn_mask_0 = mask_windows_0.unsqueeze(1) - mask_windows_0.unsqueeze(2) + attn_mask_0 = attn_mask_0.masked_fill( + attn_mask_0 != 0, float(-100.0) + ).masked_fill(attn_mask_0 == 0, 
float(0.0)) + + # calculate mask for window-1 + img_mask_1 = img_mask_1.view( + 1, + H // self.split_size[1], + self.split_size[1], + W // self.split_size[0], + self.split_size[0], + 1, + ) + img_mask_1 = ( + img_mask_1.permute(0, 1, 3, 2, 4, 5) + .contiguous() + .view(-1, self.split_size[1], self.split_size[0], 1) + ) # nW, sw[1], sw[0], 1 + mask_windows_1 = img_mask_1.view(-1, self.split_size[1] * self.split_size[0]) + attn_mask_1 = mask_windows_1.unsqueeze(1) - mask_windows_1.unsqueeze(2) + attn_mask_1 = attn_mask_1.masked_fill( + attn_mask_1 != 0, float(-100.0) + ).masked_fill(attn_mask_1 == 0, float(0.0)) + + return attn_mask_0, attn_mask_1 + + def forward(self, x, H, W): + """ + Input: x: (B, H*W, C), H, W + Output: x: (B, H*W, C) + """ + B, L, C = x.shape + assert L == H * W, "flatten img_tokens has wrong size" + + qkv = self.qkv(x).reshape(B, -1, 3, C).permute(2, 0, 1, 3) # 3, B, HW, C + # V without partition + v = qkv[2].transpose(-2, -1).contiguous().view(B, C, H, W) + + # image padding + max_split_size = max(self.split_size[0], self.split_size[1]) + pad_l = pad_t = 0 + pad_r = (max_split_size - W % max_split_size) % max_split_size + pad_b = (max_split_size - H % max_split_size) % max_split_size + + qkv = qkv.reshape(3 * B, H, W, C).permute(0, 3, 1, 2) # 3B C H W + qkv = ( + F.pad(qkv, (pad_l, pad_r, pad_t, pad_b)) + .reshape(3, B, C, -1) + .transpose(-2, -1) + ) # l r t b + _H = pad_b + H + _W = pad_r + W + _L = _H * _W + + # window-0 and window-1 on split channels [C/2, C/2]; for square windows (e.g., 8x8), window-0 and window-1 can be merged + # shift in block: (0, 4, 8, ...), (2, 6, 10, ...), (0, 4, 8, ...), (2, 6, 10, ...), ... 
+ if (self.rg_idx % 2 == 0 and self.b_idx > 0 and (self.b_idx - 2) % 4 == 0) or ( + self.rg_idx % 2 != 0 and self.b_idx % 4 == 0 + ): + qkv = qkv.view(3, B, _H, _W, C) + qkv_0 = torch.roll( + qkv[:, :, :, :, : C // 2], + shifts=(-self.shift_size[0], -self.shift_size[1]), + dims=(2, 3), + ) + qkv_0 = qkv_0.view(3, B, _L, C // 2) + qkv_1 = torch.roll( + qkv[:, :, :, :, C // 2 :], + shifts=(-self.shift_size[1], -self.shift_size[0]), + dims=(2, 3), + ) + qkv_1 = qkv_1.view(3, B, _L, C // 2) + + if self.patches_resolution != _H or self.patches_resolution != _W: + mask_tmp = self.calculate_mask(_H, _W) + x1_shift = self.attns[0](qkv_0, _H, _W, mask=mask_tmp[0].to(x.device)) + x2_shift = self.attns[1](qkv_1, _H, _W, mask=mask_tmp[1].to(x.device)) + else: + x1_shift = self.attns[0](qkv_0, _H, _W, mask=self.attn_mask_0) + x2_shift = self.attns[1](qkv_1, _H, _W, mask=self.attn_mask_1) + + x1 = torch.roll( + x1_shift, shifts=(self.shift_size[0], self.shift_size[1]), dims=(1, 2) + ) + x2 = torch.roll( + x2_shift, shifts=(self.shift_size[1], self.shift_size[0]), dims=(1, 2) + ) + x1 = x1[:, :H, :W, :].reshape(B, L, C // 2) + x2 = x2[:, :H, :W, :].reshape(B, L, C // 2) + # attention output + attened_x = torch.cat([x1, x2], dim=2) + + else: + x1 = self.attns[0](qkv[:, :, :, : C // 2], _H, _W)[:, :H, :W, :].reshape( + B, L, C // 2 + ) + x2 = self.attns[1](qkv[:, :, :, C // 2 :], _H, _W)[:, :H, :W, :].reshape( + B, L, C // 2 + ) + # attention output + attened_x = torch.cat([x1, x2], dim=2) + + # convolution output + conv_x = self.dwconv(v) + + # Adaptive Interaction Module (AIM) + # C-Map (before sigmoid) + channel_map = ( + self.channel_interaction(conv_x) + .permute(0, 2, 3, 1) + .contiguous() + .view(B, 1, C) + ) + # S-Map (before sigmoid) + attention_reshape = attened_x.transpose(-2, -1).contiguous().view(B, C, H, W) + spatial_map = self.spatial_interaction(attention_reshape) + + # C-I + attened_x = attened_x * torch.sigmoid(channel_map) + # S-I + conv_x = 
class Adaptive_Channel_Attention(nn.Module):
    # The implementation builds on XCiT code https://github.com/facebookresearch/xcit
    """Adaptive channel self-attention.

    Attention is computed between channels (per head) instead of between
    tokens, and is fused with a depth-wise convolution branch through a
    channel map and a spatial map.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads. Default: 8
        qkv_bias (bool): If True, add a learnable bias to query, key, value.
            Default: False
        qk_scale (float | None): Accepted for signature compatibility; the
            attention logits are scaled by the learned ``temperature`` and
            this value is not used.
        attn_drop (float): Attention dropout rate. Default: 0.0
        proj_drop (float): Dropout rate applied after the output projection.
            Default: 0.0
    """

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=False,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.num_heads = num_heads
        # One learnable logit scale per head.
        self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1))

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        # Depth-wise convolution branch applied to the value tensor.
        depthwise = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)
        self.dwconv = nn.Sequential(depthwise, nn.BatchNorm2d(dim), nn.GELU())

        squeezed = dim // 8
        self.channel_interaction = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(dim, squeezed, kernel_size=1),
            nn.BatchNorm2d(squeezed),
            nn.GELU(),
            nn.Conv2d(squeezed, dim, kernel_size=1),
        )

        narrowed = dim // 16
        self.spatial_interaction = nn.Sequential(
            nn.Conv2d(dim, narrowed, kernel_size=1),
            nn.BatchNorm2d(narrowed),
            nn.GELU(),
            nn.Conv2d(narrowed, 1, kernel_size=1),
        )

    def forward(self, x, H, W):
        """
        Input: x: (B, H*W, C), H, W
        Output: x: (B, H*W, C)
        """
        B, N, C = x.shape
        heads = self.num_heads

        # (3, B, heads, C // heads, N): channels-per-head become the "tokens".
        packed = self.qkv(x).reshape(B, N, 3, heads, C // heads).permute(2, 0, 3, 4, 1)
        q, k, v = packed[0], packed[1], packed[2]

        # Value tensor laid out as an image for the convolution branch.
        v_img = v.reshape(B, C, N).contiguous().view(B, C, H, W)

        q = torch.nn.functional.normalize(q, dim=-1)
        k = torch.nn.functional.normalize(k, dim=-1)

        scores = torch.matmul(q, k.transpose(-2, -1)) * self.temperature
        weights = self.attn_drop(scores.softmax(dim=-1))

        # attention output
        attn_tokens = torch.matmul(weights, v).permute(0, 3, 1, 2).reshape(B, N, C)

        # convolution output
        conv_feat = self.dwconv(v_img)

        # Adaptive Interaction Module (AIM)
        # C-Map (before sigmoid), computed from the attention branch
        attn_img = attn_tokens.transpose(-2, -1).contiguous().view(B, C, H, W)
        channel_map = self.channel_interaction(attn_img)
        # S-Map (before sigmoid), computed from the convolution branch
        spatial_map = self.spatial_interaction(conv_feat)
        spatial_map = spatial_map.permute(0, 2, 3, 1).contiguous().view(B, N, 1)

        # S-I: gate attention tokens with the spatial map
        attn_tokens = attn_tokens * torch.sigmoid(spatial_map)
        # C-I: gate convolution features with the channel map
        conv_feat = conv_feat * torch.sigmoid(channel_map)
        conv_tokens = conv_feat.permute(0, 2, 3, 1).contiguous().view(B, N, C)

        fused = self.proj(attn_tokens + conv_tokens)
        return self.proj_drop(fused)
SGFN( + in_features=dim, + hidden_features=ffn_hidden_dim, + out_features=dim, + act_layer=act_layer, + ) + self.norm2 = norm_layer(dim) + + def forward(self, x, x_size): + """ + Input: x: (B, H*W, C), x_size: (H, W) + Output: x: (B, H*W, C) + """ + H, W = x_size + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.ffn(self.norm2(x), H, W)) + + return x + + +class ResidualGroup(nn.Module): + """ResidualGroup + Args: + dim (int): Number of input channels. + reso (int): Input resolution. + num_heads (int): Number of attention heads. + split_size (tuple(int)): Height and Width of spatial window. + expansion_factor (float): Ratio of ffn hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop (float): Dropout rate. Default: 0 + attn_drop(float): Attention dropout rate. Default: 0 + drop_paths (float | None): Stochastic depth rate. + act_layer (nn.Module): Activation layer. Default: nn.GELU + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm + depth (int): Number of dual aggregation Transformer blocks in residual group. + use_chk (bool): Whether to use checkpointing to save memory. + resi_connection: The convolutional block before residual connection. 
class Upsample(nn.Sequential):
    """Upsample module built from conv + pixel-shuffle stages.

    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.

    Raises:
        ValueError: If ``scale`` is not a positive power of two or 3.
    """

    def __init__(self, scale, num_feat):
        m = []
        # A positive integer is a power of two exactly when it has one bit set.
        # The explicit positivity check keeps 0 (which also passes the bit
        # test) out of this branch.
        if scale > 0 and (scale & (scale - 1)) == 0:  # scale = 2^n
            # Exact integer log2; a truncated float logarithm is not
            # guaranteed to land on the right integer.
            num_stages = int(scale).bit_length() - 1
            for _ in range(num_stages):
                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
                m.append(nn.PixelShuffle(2))
        elif scale == 3:
            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
            m.append(nn.PixelShuffle(3))
        else:
            raise ValueError(
                f"scale {scale} is not supported. " "Supported scales: 2^n and 3."
            )
        super().__init__(*m)
def __init__(self, state_dict):
    """Build a DAT network whose layout is inferred from ``state_dict``.

    The upsampler type, channel counts, upscale factor, number of residual
    groups and blocks, head count, FFN expansion factor, residual connection
    type, image size and split size are all read from key names and tensor
    shapes in ``state_dict``. The weights are then loaded with
    ``strict=True``.

    Args:
        state_dict: Mapping from parameter name to tensor. It must contain at
            least ``conv_first.weight`` and
            ``layers.0.blocks.0.ffn.fc1.weight``.

    Raises:
        KeyError: If a key that is read unconditionally is missing.
    """
    super().__init__()

    # Hyperparameters that are not recovered from the checkpoint.
    img_size = 64
    split_size = [2, 4]
    qkv_bias = True
    qk_scale = None
    drop_rate = 0.0
    attn_drop_rate = 0.0
    drop_path_rate = 0.1
    act_layer = nn.GELU
    norm_layer = nn.LayerNorm
    use_chk = False
    img_range = 1.0

    self.model_arch = "DAT"
    self.sub_type = "SR"
    self.state = state_dict

    state_keys = state_dict.keys()

    # ---- upsampler type ---------------------------------------------------
    if "conv_before_upsample.0.weight" in state_keys:
        if "conv_up1.weight" in state_keys:
            upsampler = "nearest+conv"
        else:
            upsampler = "pixelshuffle"
    elif "upsample.0.weight" in state_keys:
        upsampler = "pixelshuffledirect"
    else:
        upsampler = ""

    # ---- channel counts ---------------------------------------------------
    # conv_before_upsample.0 is built below as Conv2d(embed_dim, num_feat), so
    # its weight has num_feat along dim 0. The previous code probed a key that
    # never exists ("conv_before_upsample.weight"), relied on tensor
    # truthiness and read dim 1, so num_feat was effectively always 64.
    if "conv_before_upsample.0.weight" in state_keys:
        num_feat = state_dict["conv_before_upsample.0.weight"].shape[0]
    else:
        num_feat = 64

    num_in_ch = state_dict["conv_first.weight"].shape[1]
    in_chans = num_in_ch
    if "conv_last.weight" in state_keys:
        num_out_ch = state_dict["conv_last.weight"].shape[0]
    else:
        num_out_ch = num_in_ch

    # ---- upscale factor ---------------------------------------------------
    upscale = 1
    if upsampler == "nearest+conv":
        # Every "conv_up*" weight doubles the resolution.
        conv_up_keys = [k for k in state_keys if "conv_up" in k and "bias" not in k]
        upscale = 2 ** len(conv_up_keys)
    elif upsampler == "pixelshuffle":
        for key in state_keys:
            if "upsample" in key and "conv" not in key and "bias" not in key:
                out_channels = state_dict[key].shape[0]
                upscale *= math.sqrt(out_channels // num_feat)
        upscale = int(upscale)
    elif upsampler == "pixelshuffledirect":
        upscale = int(
            math.sqrt(state_dict["upsample.0.bias"].shape[0] // num_out_ch)
        )

    # ---- number of residual groups / blocks per group ---------------------
    # Dots are escaped and at least one digit is required, so only real
    # "layers.<i>.blocks.<j>.norm1.weight" keys are counted.
    block_key = re.compile(r"layers\.(\d+)\.blocks\.(\d+)\.norm1\.weight")
    max_layer_num = 0
    max_block_num = 0
    for key in state_keys:
        found = block_key.match(key)
        if found:
            layer_num, block_num = found.groups()
            max_layer_num = max(max_layer_num, int(layer_num))
            max_block_num = max(max_block_num, int(block_num))

    group_count = max_layer_num + 1
    depth = [max_block_num + 1] * group_count

    if "layers.0.blocks.1.attn.temperature" in state_keys:
        head_count = state_dict["layers.0.blocks.1.attn.temperature"].shape[0]
        num_heads = [head_count] * group_count
    else:
        # Fallback kept from the original code: reuse the block counts.
        num_heads = list(depth)

    embed_dim = state_dict["conv_first.weight"].shape[0]
    expansion_factor = float(
        state_dict["layers.0.blocks.0.ffn.fc1.weight"].shape[0] / embed_dim
    )

    # TODO: could actually count the layers, but this should do
    if "layers.0.conv.4.weight" in state_keys:
        resi_connection = "3conv"
    else:
        resi_connection = "1conv"

    if "layers.0.blocks.2.attn.attn_mask_0" in state_keys:
        mask_x, mask_y, _mask_z = state_dict[
            "layers.0.blocks.2.attn.attn_mask_0"
        ].shape
        img_size = int(math.sqrt(mask_x * mask_y))

    if "layers.0.blocks.0.attn.attns.0.rpe_biases" in state_keys:
        # The last row of rpe_biases plus one gives the split size.
        last_bias = state_dict["layers.0.blocks.0.attn.attns.0.rpe_biases"][-1] + 1
        split_size = [int(v) for v in last_bias]

    # ---- metadata exposed on the instance ---------------------------------
    self.in_nc = num_in_ch
    self.out_nc = num_out_ch
    self.num_feat = num_feat
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.depth = depth
    self.scale = upscale
    self.upsampler = upsampler
    self.img_size = img_size
    self.img_range = img_range
    self.expansion_factor = expansion_factor
    self.resi_connection = resi_connection
    self.split_size = split_size

    self.supports_fp16 = False  # Too much weirdness to support this at the moment
    self.supports_bfp16 = True
    self.min_size_restriction = 16

    # NOTE(review): as before, the layers are built with as many output
    # channels as input channels, regardless of the conv_last shape recorded
    # in ``self.out_nc``. Confirm before supporting checkpoints where the two
    # differ, because forward() adds ``self.mean`` to the output.
    layer_out_ch = in_chans

    if in_chans == 3:
        rgb_mean = (0.4488, 0.4371, 0.4040)
        self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
    else:
        self.mean = torch.zeros(1, 1, 1, 1)
    self.upscale = upscale

    # ------------------------- 1, Shallow Feature Extraction ------------------------- #
    self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)

    # ------------------------- 2, Deep Feature Extraction ------------------------- #
    self.num_layers = len(depth)
    self.use_chk = use_chk
    self.num_features = embed_dim  # num_features for consistency with other models

    self.before_RG = nn.Sequential(
        Rearrange("b c h w -> b (h w) c"), nn.LayerNorm(embed_dim)
    )

    # Stochastic depth decay rule: one linearly spaced rate per block.
    dpr = [r.item() for r in torch.linspace(0, drop_path_rate, sum(depth))]

    self.layers = nn.ModuleList()
    first_block = 0
    for i in range(self.num_layers):
        next_block = first_block + depth[i]
        self.layers.append(
            ResidualGroup(
                dim=embed_dim,
                num_heads=num_heads[i],
                reso=img_size,
                split_size=split_size,
                expansion_factor=expansion_factor,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_paths=dpr[first_block:next_block],
                act_layer=act_layer,
                norm_layer=norm_layer,
                depth=depth[i],
                use_chk=use_chk,
                resi_connection=resi_connection,
                rg_idx=i,
            )
        )
        first_block = next_block

    self.norm = norm_layer(embed_dim)
    # build the last conv layer in deep feature extraction
    if resi_connection == "1conv":
        self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
    elif resi_connection == "3conv":
        # to save parameters and memory
        self.conv_after_body = nn.Sequential(
            nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0),
            nn.LeakyReLU(negative_slope=0.2, inplace=True),
            nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1),
        )

    # ------------------------- 3, Reconstruction ------------------------- #
    if self.upsampler == "pixelshuffle":
        # for classical SR; uses the detected num_feat rather than a fixed 64
        self.conv_before_upsample = nn.Sequential(
            nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True)
        )
        self.upsample = Upsample(upscale, num_feat)
        self.conv_last = nn.Conv2d(num_feat, layer_out_ch, 3, 1, 1)
    elif self.upsampler == "pixelshuffledirect":
        # for lightweight SR (to save parameters)
        self.upsample = UpsampleOneStep(
            upscale, embed_dim, layer_out_ch, (img_size, img_size)
        )

    self.apply(self._init_weights)
    self.load_state_dict(state_dict, strict=True)
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    From: https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py

    Args:
        x: Input tensor; the first dimension is treated as the sample axis.
        drop_prob: Probability of zeroing a whole sample. ``None`` and ``0.0``
            both disable dropping.
        training: Dropping is only applied when this is True.

    Returns:
        ``x`` itself when dropping is disabled. Otherwise a tensor shaped
        like ``x`` in which every sample is either zeroed or scaled by
        ``1 / (1 - drop_prob)``.
    """
    # ``None`` is the default of DropPath(); treat it like "never drop" rather
    # than failing on ``1 - None`` in training mode.
    if drop_prob is None or drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0.0:
        # Every sample is dropped. Dividing by keep_prob here would give
        # inf * 0 = NaN instead of zeros.
        return torch.zeros_like(x)
    shape = (x.shape[0],) + (1,) * (
        x.ndim - 1
    )  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    return x.div(keep_prob) * random_tensor
def window_reverse(windows, window_size, h, w):
    """Merge windows back into a feature map.

    Args:
        windows: (num_windows*b, window_size, window_size, c)
        window_size (int): Window size
        h (int): Height of image
        w (int): Width of image
    Returns:
        x: (b, h, w, c)
    """
    rows = h // window_size
    cols = w // window_size
    # Exact integer arithmetic; avoids the float division and int()
    # truncation used before.
    b = windows.shape[0] // (rows * cols)
    x = windows.view(b, rows, cols, window_size, window_size, -1)
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(b, h, w, -1)
    return x
class HAB(nn.Module):
    r"""Hybrid Attention Block.

    Combines (shifted) window attention with a convolutional branch whose
    output is scaled by ``conv_scale``, followed by an MLP.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        compress_ratio (int): Forwarded to the CAB conv branch.
        squeeze_factor (int): Forwarded to the CAB conv branch.
        conv_scale (float): Multiplier applied to the conv branch output.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim,
        input_resolution,
        num_heads,
        window_size=7,
        shift_size=0,
        compress_ratio=3,
        squeeze_factor=30,
        conv_scale=0.01,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.mlp_ratio = mlp_ratio

        # When the input is no larger than one window, use a single unshifted
        # window that spans the shorter side.
        shortest_side = min(input_resolution)
        if shortest_side <= window_size:
            window_size, shift_size = shortest_side, 0
        self.window_size = window_size
        self.shift_size = shift_size
        assert (
            0 <= self.shift_size < self.window_size
        ), "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.conv_scale = conv_scale
        self.conv_block = CAB(
            num_feat=dim, compress_ratio=compress_ratio, squeeze_factor=squeeze_factor
        )

        if drop_path > 0.0:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

    def forward(self, x, x_size, rpi_sa, attn_mask):
        """Apply the block to ``x`` of shape (b, h*w, c); ``x_size`` is (h, w).

        ``rpi_sa`` is passed to the window attention as ``rpi``.
        ``attn_mask`` is only used when the block has a non-zero shift.
        """
        height, width = x_size
        batch, _, channels = x.shape
        win = self.window_size
        shift = self.shift_size

        residual = x
        grid = self.norm1(x).view(batch, height, width, channels)

        # Convolutional branch, computed on the normalized, unshifted features.
        conv_branch = self.conv_block(grid.permute(0, 3, 1, 2))
        conv_branch = (
            conv_branch.permute(0, 2, 3, 1)
            .contiguous()
            .view(batch, height * width, channels)
        )

        # Cyclic shift; unshifted blocks attend without a mask.
        if shift > 0:
            attn_input = torch.roll(grid, shifts=(-shift, -shift), dims=(1, 2))
            mask = attn_mask
        else:
            attn_input = grid
            mask = None

        # Window partition -> attention -> window merge.
        windows = window_partition(attn_input, win)  # nw*b, win, win, c
        windows = windows.view(-1, win * win, channels)  # nw*b, win*win, c
        attended = self.attn(windows, rpi=rpi_sa, mask=mask)
        attended = attended.view(-1, win, win, channels)
        merged = window_reverse(attended, win, height, width)  # b h' w' c

        # Undo the cyclic shift.
        if shift > 0:
            merged = torch.roll(merged, shifts=(shift, shift), dims=(1, 2))
        merged = merged.view(batch, height * width, channels)

        # Residual sum of both branches, then the FFN with its own residual.
        out = residual + self.drop_path(merged) + conv_branch * self.conv_scale
        out = out + self.drop_path(self.mlp(self.norm2(out)))
        return out
class OCAB(nn.Module):
    """Overlapping cross-attention block.

    Queries come from non-overlapping windows of size ``window_size``. Keys
    and values come from larger windows of size ``overlap_win_size`` that are
    extracted with ``nn.Unfold`` at a stride of ``window_size``.
    """

    def __init__(
        self,
        dim,
        input_resolution,
        window_size,
        overlap_ratio,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        mlp_ratio=2,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.window_size = window_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5
        overlap = int(window_size * overlap_ratio) + window_size
        self.overlap_win_size = overlap

        self.norm1 = norm_layer(dim)
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.unfold = nn.Unfold(
            kernel_size=(overlap, overlap),
            stride=window_size,
            padding=(overlap - window_size) // 2,
        )

        # Relative position bias table: one row per relative offset between a
        # query position and a key position, one column per head.
        offsets_per_axis = window_size + overlap - 1
        self.relative_position_bias_table = nn.Parameter(  # type: ignore
            torch.zeros(offsets_per_axis * offsets_per_axis, num_heads)
        )

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

        self.proj = nn.Linear(dim, dim)

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=nn.GELU
        )

    def forward(self, x, x_size, rpi):
        """Apply the block to ``x`` of shape (b, h*w, c); ``x_size`` is (h, w).

        ``rpi`` indexes ``relative_position_bias_table``.
        """
        h, w = x_size
        b, _, c = x.shape
        win = self.window_size
        overlap = self.overlap_win_size
        heads = self.num_heads

        residual = x
        grid = self.norm1(x).view(b, h, w, c)

        qkv = self.qkv(grid).reshape(b, h, w, 3, c).permute(3, 0, 4, 1, 2)  # 3, b, c, h, w
        query_map = qkv[0].permute(0, 2, 3, 1)  # b, h, w, c
        key_value_map = torch.cat((qkv[1], qkv[2]), dim=1)  # b, 2*c, h, w

        # Queries: plain non-overlapping windows.
        q_windows = window_partition(query_map, win)  # nw*b, win, win, c
        q_windows = q_windows.view(-1, win * win, c)  # nw*b, win*win, c

        # Keys / values: overlapping windows taken by the unfold operator.
        kv_windows = self.unfold(key_value_map)  # b, c*w*w, nw
        kv_windows = rearrange(
            kv_windows,
            "b (nc ch owh oww) nw -> nc (b nw) (owh oww) ch",
            nc=2,
            ch=c,
            owh=overlap,
            oww=overlap,
        ).contiguous()  # 2, nw*b, ow*ow, c
        k_windows = kv_windows[0]  # nw*b, ow*ow, c
        v_windows = kv_windows[1]  # nw*b, ow*ow, c

        b_, nq, _ = q_windows.shape
        _, n, _ = k_windows.shape
        d = self.dim // heads
        q = q_windows.reshape(b_, nq, heads, d).permute(0, 2, 1, 3)  # nw*b, nH, nq, d
        k = k_windows.reshape(b_, n, heads, d).permute(0, 2, 1, 3)  # nw*b, nH, n, d
        v = v_windows.reshape(b_, n, heads, d).permute(0, 2, 1, 3)  # nw*b, nH, n, d

        scores = (q * self.scale) @ k.transpose(-2, -1)

        bias = self.relative_position_bias_table[rpi.view(-1)].view(
            win * win, overlap * overlap, -1
        )  # ws*ws, wse*wse, nH
        bias = bias.permute(2, 0, 1).contiguous()  # nH, ws*ws, wse*wse
        scores = scores + bias.unsqueeze(0)

        weights = self.softmax(scores)
        attended = (weights @ v).transpose(1, 2).reshape(b_, nq, self.dim)

        # Merge the windows back into a (b, h*w, c) sequence.
        attended = attended.view(-1, win, win, self.dim)
        merged = window_reverse(attended, win, h, w)  # b h w c
        merged = merged.view(b, h * w, self.dim)

        out = self.proj(merged) + residual
        out = out + self.mlp(self.norm2(out))
        return out
class RHAG(nn.Module):
    """Residual Hybrid Attention Group (RHAG).

    Runs an ``AttenBlocks`` stack, un-embeds the tokens to an image, applies
    the residual convolution, embeds back to tokens and adds the input.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        compress_ratio, squeeze_factor, conv_scale, overlap_ratio: Forwarded
            unchanged to ``AttenBlocks``.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Forwarded to ``AttenBlocks``. Default: False.
        img_size: Input image size.
        patch_size: Patch size.
        resi_connection: The block before the residual connection:
            ``"1conv"`` (one 3x3 convolution) or ``"identity"``.

    Raises:
        ValueError: If ``resi_connection`` is not one of the supported values.
    """

    def __init__(
        self,
        dim,
        input_resolution,
        depth,
        num_heads,
        window_size,
        compress_ratio,
        squeeze_factor,
        conv_scale,
        overlap_ratio,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        downsample=None,
        use_checkpoint=False,
        img_size=224,
        patch_size=4,
        resi_connection="1conv",
    ):
        super(RHAG, self).__init__()

        # Fail at construction time: an unknown value used to leave
        # ``self.conv`` undefined, which only surfaced as an AttributeError
        # on the first forward() call.
        if resi_connection not in ("1conv", "identity"):
            raise ValueError(
                f"resi_connection {resi_connection!r} is not supported. "
                "Supported values: '1conv' and 'identity'."
            )

        self.dim = dim
        self.input_resolution = input_resolution

        self.residual_group = AttenBlocks(
            dim=dim,
            input_resolution=input_resolution,
            depth=depth,
            num_heads=num_heads,
            window_size=window_size,
            compress_ratio=compress_ratio,
            squeeze_factor=squeeze_factor,
            conv_scale=conv_scale,
            overlap_ratio=overlap_ratio,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop,
            attn_drop=attn_drop,
            drop_path=drop_path,
            norm_layer=norm_layer,
            downsample=downsample,
            use_checkpoint=use_checkpoint,
        )

        if resi_connection == "1conv":
            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
        else:  # "identity"
            self.conv = nn.Identity()

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=0,
            embed_dim=dim,
            norm_layer=None,
        )

        self.patch_unembed = PatchUnEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=0,
            embed_dim=dim,
            norm_layer=None,
        )

    def forward(self, x, x_size, params):
        """Return ``x`` plus the embedded, convolved output of the block stack.

        ``x_size`` and ``params`` are handed on to ``AttenBlocks``;
        ``x_size`` is also used to un-embed the tokens before the conv.
        """
        tokens = self.residual_group(x, x_size, params)
        image = self.conv(self.patch_unembed(tokens, x_size))
        return self.patch_embed(image) + x
class Upsample(nn.Sequential):
    """Upsample module.

    Stacks one ``Conv2d`` + ``PixelShuffle(2)`` pair per factor of two, or a
    single ``Conv2d`` + ``PixelShuffle(3)`` pair for a scale of 3.

    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.

    Raises:
        ValueError: If ``scale`` is neither a positive power of two nor 3.
    """

    def __init__(self, scale, num_feat):
        m = []
        # ``scale >= 1`` keeps 0 out of this branch: 0 passes the bit test and
        # used to fail inside math.log with "math domain error".
        if scale >= 1 and (scale & (scale - 1)) == 0:  # scale = 2^n
            # math.log2 is exact for powers of two, unlike the float quotient
            # computed by math.log(scale, 2).
            for _ in range(int(math.log2(scale))):
                m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
                m.append(nn.PixelShuffle(2))
        elif scale == 3:
            m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1))
            m.append(nn.PixelShuffle(3))
        else:
            raise ValueError(
                f"scale {scale} is not supported. " "Supported scales: 2^n and 3."
            )
        super(Upsample, self).__init__(*m)
class HAT(nn.Module):
    r"""Hybrid Attention Transformer
    A PyTorch implementation of : `Activating More Pixels in Image Super-Resolution Transformer`.
    Some codes are based on SwinIR.

    The constructor takes a checkpoint ``state_dict`` and infers the
    architecture hyper-parameters from the tensor shapes and key names it
    contains; the remaining ones are fixed defaults. The resulting values are:

    Args:
        state_dict: Checkpoint mapping parameter names to tensors. It is
            inspected to configure the network and then loaded with
            ``strict=False``.
        **kwargs: Accepted and ignored.

    Hyper-parameters (inferred from ``state_dict`` unless marked fixed):
        img_size (int): Input image size. Default 64; recomputed from the
            ``attn_mask`` buffer of block 1 when that key is present.
        patch_size (int): Patch size. Fixed: 1
        in_chans (int): Number of input image channels (``conv_first.weight``).
        embed_dim (int): Patch embedding dimension (``conv_first.weight``).
        depths (list(int)): Depth of each layer (counted from block keys).
        num_heads (list(int)): Number of attention heads in different layers.
        window_size (int): Window size (``relative_position_index_SA``).
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Fixed: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Fixed: None
        drop_rate (float): Dropout rate. Fixed: 0
        attn_drop_rate (float): Attention dropout rate. Fixed: 0
        drop_path_rate (float): Stochastic depth rate. Fixed: 0.1
        norm_layer (nn.Module): Normalization layer. Fixed: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Fixed: False
        patch_norm (bool): If True, add normalization after patch embedding. Fixed: True
        use_checkpoint (bool): Whether to use checkpointing to save memory. Fixed: False
        upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction
        img_range: Image range. 1. or 255. Fixed: 1.0
        upsampler: The reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None
        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'
    """

    def __init__(
        self,
        state_dict,
        **kwargs,
    ):
        super(HAT, self).__init__()

        # Fixed defaults (not recoverable from the checkpoint).
        img_size = 64
        patch_size = 1
        compress_ratio = 3
        squeeze_factor = 30
        conv_scale = 0.01
        overlap_ratio = 0.5
        qkv_bias = True
        qk_scale = None
        drop_rate = 0.0
        attn_drop_rate = 0.0
        drop_path_rate = 0.1
        norm_layer = nn.LayerNorm
        ape = False
        patch_norm = True
        use_checkpoint = False
        img_range = 1.0

        self.state = state_dict
        self.model_arch = "HAT"
        self.sub_type = "SR"
        self.supports_fp16 = False
        # NOTE(review): attribute is spelled "support_bf16" (no trailing "s"
        # on "supports"); kept as-is because external readers may rely on it.
        self.support_bf16 = True
        self.min_size_restriction = 16

        state_keys = list(state_dict.keys())

        # Channel counts come straight from the first / last conv weights.
        num_feat = state_dict["conv_last.weight"].shape[1]
        in_chans = state_dict["conv_first.weight"].shape[1]
        num_out_ch = state_dict["conv_last.weight"].shape[0]
        embed_dim = state_dict["conv_first.weight"].shape[0]

        # Which reconstruction head the checkpoint was trained with.
        if "conv_before_upsample.0.weight" in state_keys:
            if "conv_up1.weight" in state_keys:
                upsampler = "nearest+conv"
            else:
                upsampler = "pixelshuffle"
        elif "upsample.0.weight" in state_keys:
            upsampler = "pixelshuffledirect"
        else:
            upsampler = ""

        # Upscale factor, derived from the upsampling weights.
        upscale = 1
        if upsampler == "nearest+conv":
            upsample_keys = [
                x for x in state_keys if "conv_up" in x and "bias" not in x
            ]
            # Each conv_up layer doubles the resolution.
            upscale *= 2 ** len(upsample_keys)
        elif upsampler == "pixelshuffle":
            upsample_keys = [
                x
                for x in state_keys
                if "upsample" in x and "conv" not in x and "bias" not in x
            ]
            for upsample_key in upsample_keys:
                shape = self.state[upsample_key].shape[0]
                # A PixelShuffle(r) stage expands channels by r * r.
                upscale *= math.sqrt(shape // num_feat)
            upscale = int(upscale)
        elif upsampler == "pixelshuffledirect":
            upscale = int(
                math.sqrt(self.state["upsample.0.bias"].shape[0] // num_out_ch)
            )

        # Count layers / blocks from the highest indices seen in the keys.
        # Dots are escaped and at least one digit is required so that the
        # captured groups are always convertible with int().
        block_key = re.compile(
            r"layers\.(\d+)\.residual_group\.blocks\.(\d+)\.conv_block\.cab\.0\.weight"
        )
        max_layer_num = 0
        max_block_num = 0
        for key in state_keys:
            result = block_key.match(key)
            if result:
                layer_num, block_num = result.groups()
                max_layer_num = max(max_layer_num, int(layer_num))
                max_block_num = max(max_block_num, int(block_num))

        depths = [max_block_num + 1 for _ in range(max_layer_num + 1)]

        if (
            "layers.0.residual_group.blocks.0.attn.relative_position_bias_table"
            in state_keys
        ):
            num_heads_num = self.state[
                "layers.0.residual_group.blocks.0.attn.relative_position_bias_table"
            ].shape[-1]
            num_heads = [num_heads_num for _ in range(max_layer_num + 1)]
        else:
            num_heads = depths

        mlp_ratio = float(
            self.state["layers.0.residual_group.blocks.0.mlp.fc1.bias"].shape[0]
            / embed_dim
        )

        # TODO: could actually count the layers, but this should do
        if "layers.0.conv.4.weight" in state_keys:
            resi_connection = "3conv"
        else:
            resi_connection = "1conv"

        window_size = int(math.sqrt(self.state["relative_position_index_SA"].shape[0]))

        # Not sure if this is needed or used at all anywhere in HAT's config
        if "layers.0.residual_group.blocks.1.attn_mask" in state_keys:
            img_size = int(
                math.sqrt(
                    self.state["layers.0.residual_group.blocks.1.attn_mask"].shape[0]
                )
                * window_size
            )

        self.window_size = window_size
        self.shift_size = window_size // 2
        self.overlap_ratio = overlap_ratio

        self.in_nc = in_chans
        self.out_nc = num_out_ch
        self.num_feat = num_feat
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.depths = depths
        self.mlp_ratio = mlp_ratio
        self.scale = upscale
        self.upscale = upscale
        self.upsampler = upsampler
        self.img_size = img_size
        self.img_range = img_range
        self.resi_connection = resi_connection

        num_in_ch = in_chans
        # Per-channel mean subtracted in forward(); a plain attribute, not a
        # registered buffer.
        if in_chans == 3:
            rgb_mean = (0.4488, 0.4371, 0.4040)
            self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1)
        else:
            self.mean = torch.zeros(1, 1, 1, 1)

        # relative position index
        relative_position_index_SA = self.calculate_rpi_sa()
        relative_position_index_OCA = self.calculate_rpi_oca()
        self.register_buffer("relative_position_index_SA", relative_position_index_SA)
        self.register_buffer("relative_position_index_OCA", relative_position_index_OCA)

        # ------------------------- 1, shallow feature extraction ------------------------- #
        self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1)

        # ------------------------- 2, deep feature extraction ------------------------- #
        self.num_layers = len(depths)
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = embed_dim

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=embed_dim,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # merge non-overlapping patches into image
        self.patch_unembed = PatchUnEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=embed_dim,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
        )

        # absolute position embedding
        if self.ape:
            self.absolute_pos_embed = nn.Parameter(  # type: ignore[arg-type]
                torch.zeros(1, num_patches, embed_dim)
            )
            trunc_normal_(self.absolute_pos_embed, std=0.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule

        # build Residual Hybrid Attention Groups (RHAG)
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = RHAG(
                dim=embed_dim,
                input_resolution=(patches_resolution[0], patches_resolution[1]),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                compress_ratio=compress_ratio,
                squeeze_factor=squeeze_factor,
                conv_scale=conv_scale,
                overlap_ratio=overlap_ratio,
                mlp_ratio=self.mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[
                    sum(depths[:i_layer]) : sum(depths[: i_layer + 1])  # type: ignore
                ],  # no impact on SR results
                norm_layer=norm_layer,
                downsample=None,
                use_checkpoint=use_checkpoint,
                img_size=img_size,
                patch_size=patch_size,
                resi_connection=resi_connection,
            )
            self.layers.append(layer)
        self.norm = norm_layer(self.num_features)

        # build the last conv layer in deep feature extraction
        # NOTE(review): the detection above can yield "3conv", which has no
        # branch here, so ``conv_after_body`` is then never created and
        # forward() would fail on it. Confirm whether "3conv" checkpoints
        # need support.
        if resi_connection == "1conv":
            self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1)
        elif resi_connection == "identity":
            self.conv_after_body = nn.Identity()

        # ------------------------- 3, high quality image reconstruction ------------------------- #
        if self.upsampler == "pixelshuffle":
            # for classical SR
            self.conv_before_upsample = nn.Sequential(
                nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True)
            )
            self.upsample = Upsample(upscale, num_feat)
            self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1)

        self.apply(self._init_weights)
        # strict=False: keys that do not match a module are silently skipped.
        self.load_state_dict(self.state, strict=False)

    def _init_weights(self, m):
        """Initialise Linear and LayerNorm sub-modules (used via ``apply``)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def calculate_rpi_sa(self):
        """Return the (Wh*Ww, Wh*Ww) relative position index for SA."""
        # calculate relative position index for SA
        coords_h = torch.arange(self.window_size)
        coords_w = torch.arange(self.window_size)
        # indexing="ij" is the behaviour the implicit default provided; being
        # explicit avoids the torch.meshgrid deprecation warning.
        coords = torch.stack(
            torch.meshgrid([coords_h, coords_w], indexing="ij")
        )  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size - 1
        # Row offset is scaled so (row, col) pairs map to a unique flat index.
        relative_coords[:, :, 0] *= 2 * self.window_size - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        return relative_position_index

    def calculate_rpi_oca(self):
        """Return the (ws*ws, wse*wse) relative position index for OCA.

        ``wse`` is the window size extended by ``overlap_ratio``.
        """
        # calculate relative position index for OCA
        window_size_ori = self.window_size
        window_size_ext = self.window_size + int(self.overlap_ratio * self.window_size)

        coords_h = torch.arange(window_size_ori)
        coords_w = torch.arange(window_size_ori)
        coords_ori = torch.stack(
            torch.meshgrid([coords_h, coords_w], indexing="ij")
        )  # 2, ws, ws
        coords_ori_flatten = torch.flatten(coords_ori, 1)  # 2, ws*ws

        coords_h = torch.arange(window_size_ext)
        coords_w = torch.arange(window_size_ext)
        coords_ext = torch.stack(
            torch.meshgrid([coords_h, coords_w], indexing="ij")
        )  # 2, wse, wse
        coords_ext_flatten = torch.flatten(coords_ext, 1)  # 2, wse*wse

        relative_coords = (
            coords_ext_flatten[:, None, :] - coords_ori_flatten[:, :, None]
        )  # 2, ws*ws, wse*wse

        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # ws*ws, wse*wse, 2
        relative_coords[:, :, 0] += (
            window_size_ori - window_size_ext + 1
        )  # shift to start from 0
        relative_coords[:, :, 1] += window_size_ori - window_size_ext + 1

        relative_coords[:, :, 0] *= window_size_ori + window_size_ext - 1
        relative_position_index = relative_coords.sum(-1)
        return relative_position_index

    def calculate_mask(self, x_size):
        """Build the SW-MSA attention mask for a feature map of ``x_size``.

        Returns a tensor holding 0.0 where two positions of a window belong
        to the same region and -100.0 where they do not.
        """
        # calculate attention mask for SW-MSA
        h, w = x_size
        img_mask = torch.zeros((1, h, w, 1))  # 1 h w 1
        # The same three bands are used along both spatial axes.
        bands = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        cnt = 0
        for h_band in bands:
            for w_band in bands:
                img_mask[:, h_band, w_band, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size
        )  # nw, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )

        return attn_mask

    @torch.jit.ignore  # type: ignore
    def no_weight_decay(self):
        return {"absolute_pos_embed"}

    @torch.jit.ignore  # type: ignore
    def no_weight_decay_keywords(self):
        return {"relative_position_bias_table"}

    def check_image_size(self, x):
        """Reflect-pad ``x`` on the bottom/right to a multiple of window_size."""
        _, _, h, w = x.size()
        mod_pad_h = (self.window_size - h % self.window_size) % self.window_size
        mod_pad_w = (self.window_size - w % self.window_size) % self.window_size
        x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect")
        return x

    def forward_features(self, x):
        """Run the deep feature extractor on a (b, c, h, w) feature map."""
        x_size = (x.shape[2], x.shape[3])

        # Calculate attention mask and relative position index in advance to speed up inference.
        # The original code is very time-consuming for large window size.
        attn_mask = self.calculate_mask(x_size).to(x.device)
        params = {
            "attn_mask": attn_mask,
            "rpi_sa": self.relative_position_index_SA,
            "rpi_oca": self.relative_position_index_OCA,
        }

        x = self.patch_embed(x)
        if self.ape:
            x = x + self.absolute_pos_embed
        x = self.pos_drop(x)

        for layer in self.layers:
            x = layer(x, x_size, params)

        x = self.norm(x)  # b seq_len c
        x = self.patch_unembed(x, x_size)

        return x

    def forward(self, x):
        """Super-resolve ``x`` and crop the output to ``upscale`` times its size."""
        H, W = x.shape[2:]
        # Keep the stored mean on the same dtype/device as the input.
        self.mean = self.mean.type_as(x)
        x = (x - self.mean) * self.img_range
        x = self.check_image_size(x)

        # NOTE(review): only "pixelshuffle" has a reconstruction path here;
        # for any other upsampler the padded, normalized input falls through
        # unchanged. Confirm whether that is intended.
        if self.upsampler == "pixelshuffle":
            # for classical SR
            x = self.conv_first(x)
            x = self.conv_after_body(self.forward_features(x)) + x
            x = self.conv_before_upsample(x)
            x = self.conv_last(self.upsample(x))

        x = x / self.img_range + self.mean

        # Drop the rows/columns that came from the window-size padding.
        return x[:, :, : H * self.upscale, : W * self.upscale]
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-ESRGAN b/ldm_patched/pfn/architecture/LICENSE-ESRGAN new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-ESRGAN @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-HAT b/ldm_patched/pfn/architecture/LICENSE-HAT new file mode 100644 index 0000000000000000000000000000000000000000..003e97e96cbed07d07b5ff15831711181607edb3 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-HAT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Xiangyu Chen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/ldm_patched/pfn/architecture/LICENSE-RealESRGAN b/ldm_patched/pfn/architecture/LICENSE-RealESRGAN new file mode 100644 index 0000000000000000000000000000000000000000..552a1eeaf01f4e7077013ed3496600c608f35202 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-RealESRGAN @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2021, Xintao Wang +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/ldm_patched/pfn/architecture/LICENSE-SCUNet b/ldm_patched/pfn/architecture/LICENSE-SCUNet new file mode 100644 index 0000000000000000000000000000000000000000..ff75c988f3482ab21da41f0d10068108be54ad88 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-SCUNet @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2022 Kai Zhang (cskaizhang@gmail.com, https://cszn.github.io/). All rights reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-SPSR b/ldm_patched/pfn/architecture/LICENSE-SPSR new file mode 100644 index 0000000000000000000000000000000000000000..3245f3f9e4f476ee3a283f41dd0d9db65544c222 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-SPSR @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2022 BasicSR Authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-SwiftSRGAN b/ldm_patched/pfn/architecture/LICENSE-SwiftSRGAN new file mode 100644 index 0000000000000000000000000000000000000000..0e259d42c996742e9e3cba14c677129b2c1b6311 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-SwiftSRGAN @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. 
CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). 
Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). 
Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. 
No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/ldm_patched/pfn/architecture/LICENSE-Swin2SR b/ldm_patched/pfn/architecture/LICENSE-Swin2SR new file mode 100644 index 0000000000000000000000000000000000000000..e5e4ee061a3f3fbad64bc837425716af7fb108f5 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-Swin2SR @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] [SwinIR Authors] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-SwinIR b/ldm_patched/pfn/architecture/LICENSE-SwinIR new file mode 100644 index 0000000000000000000000000000000000000000..e5e4ee061a3f3fbad64bc837425716af7fb108f5 --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-SwinIR @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] [SwinIR Authors] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/LICENSE-lama b/ldm_patched/pfn/architecture/LICENSE-lama new file mode 100644 index 0000000000000000000000000000000000000000..ca822bb5f62a37a5a73f56a2d563b16dab46c03f --- /dev/null +++ b/ldm_patched/pfn/architecture/LICENSE-lama @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2021] Samsung Research + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/ldm_patched/pfn/architecture/LaMa.py b/ldm_patched/pfn/architecture/LaMa.py new file mode 100644 index 0000000000000000000000000000000000000000..a781f3e4dda789c06493fcf35a9803ee61efce73 --- /dev/null +++ b/ldm_patched/pfn/architecture/LaMa.py @@ -0,0 +1,694 @@ +# pylint: skip-file +""" +Model adapted from advimman's lama project: https://github.com/advimman/lama +""" + +# Fast Fourier Convolution NeurIPS 2020 +# original implementation https://github.com/pkumivision/FFC/blob/main/model_zoo/ffc.py +# paper https://proceedings.neurips.cc/paper/2020/file/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.transforms.functional import InterpolationMode, rotate + + +class LearnableSpatialTransformWrapper(nn.Module): + def __init__(self, impl, pad_coef=0.5, angle_init_range=80, train_angle=True): + super().__init__() + self.impl = impl + self.angle = torch.rand(1) * angle_init_range + if train_angle: + self.angle = nn.Parameter(self.angle, requires_grad=True) + self.pad_coef = pad_coef + + def forward(self, x): + if torch.is_tensor(x): + return self.inverse_transform(self.impl(self.transform(x)), x) + elif isinstance(x, tuple): + x_trans = tuple(self.transform(elem) for elem in x) + y_trans = self.impl(x_trans) + return tuple( + self.inverse_transform(elem, orig_x) for elem, orig_x in zip(y_trans, x) + ) + else: + raise ValueError(f"Unexpected input type {type(x)}") + + def transform(self, x): + height, width = x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef) + x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode="reflect") + x_padded_rotated = rotate( + x_padded, self.angle.to(x_padded), InterpolationMode.BILINEAR, fill=0 + ) + + return x_padded_rotated + + def inverse_transform(self, y_padded_rotated, orig_x): + height, width = orig_x.shape[2:] + pad_h, pad_w = int(height * self.pad_coef), int(width * 
self.pad_coef) + + y_padded = rotate( + y_padded_rotated, + -self.angle.to(y_padded_rotated), + InterpolationMode.BILINEAR, + fill=0, + ) + y_height, y_width = y_padded.shape[2:] + y = y_padded[:, :, pad_h : y_height - pad_h, pad_w : y_width - pad_w] + return y + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + res = x * y.expand_as(x) + return res + + +class FourierUnit(nn.Module): + def __init__( + self, + in_channels, + out_channels, + groups=1, + spatial_scale_factor=None, + spatial_scale_mode="bilinear", + spectral_pos_encoding=False, + use_se=False, + se_kwargs=None, + ffc3d=False, + fft_norm="ortho", + ): + # bn_layer not used + super(FourierUnit, self).__init__() + self.groups = groups + + self.conv_layer = torch.nn.Conv2d( + in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), + out_channels=out_channels * 2, + kernel_size=1, + stride=1, + padding=0, + groups=self.groups, + bias=False, + ) + self.bn = torch.nn.BatchNorm2d(out_channels * 2) + self.relu = torch.nn.ReLU(inplace=True) + + # squeeze and excitation block + self.use_se = use_se + if use_se: + if se_kwargs is None: + se_kwargs = {} + self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) + + self.spatial_scale_factor = spatial_scale_factor + self.spatial_scale_mode = spatial_scale_mode + self.spectral_pos_encoding = spectral_pos_encoding + self.ffc3d = ffc3d + self.fft_norm = fft_norm + + def forward(self, x): + half_check = False + if x.type() == "torch.cuda.HalfTensor": + # half only works on gpu anyway + half_check = True + + batch = x.shape[0] + + if self.spatial_scale_factor 
is not None: + orig_size = x.shape[-2:] + x = F.interpolate( + x, + scale_factor=self.spatial_scale_factor, + mode=self.spatial_scale_mode, + align_corners=False, + ) + + # (batch, c, h, w/2+1, 2) + fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) + if half_check == True: + ffted = torch.fft.rfftn( + x.float(), dim=fft_dim, norm=self.fft_norm + ) # .type(torch.cuda.HalfTensor) + else: + ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) + + ffted = torch.stack((ffted.real, ffted.imag), dim=-1) + ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1) + ffted = ffted.view( + ( + batch, + -1, + ) + + ffted.size()[3:] + ) + + if self.spectral_pos_encoding: + height, width = ffted.shape[-2:] + coords_vert = ( + torch.linspace(0, 1, height)[None, None, :, None] + .expand(batch, 1, height, width) + .to(ffted) + ) + coords_hor = ( + torch.linspace(0, 1, width)[None, None, None, :] + .expand(batch, 1, height, width) + .to(ffted) + ) + ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) + + if self.use_se: + ffted = self.se(ffted) + + if half_check == True: + ffted = self.conv_layer(ffted.half()) # (batch, c*2, h, w/2+1) + else: + ffted = self.conv_layer( + ffted + ) # .type(torch.cuda.FloatTensor) # (batch, c*2, h, w/2+1) + + ffted = self.relu(self.bn(ffted)) + # forcing to be always float + ffted = ffted.float() + + ffted = ( + ffted.view( + ( + batch, + -1, + 2, + ) + + ffted.size()[2:] + ) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) # (batch,c, t, h, w/2+1, 2) + + ffted = torch.complex(ffted[..., 0], ffted[..., 1]) + + ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] + output = torch.fft.irfftn( + ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm + ) + + if half_check == True: + output = output.half() + + if self.spatial_scale_factor is not None: + output = F.interpolate( + output, + size=orig_size, + mode=self.spatial_scale_mode, + align_corners=False, + ) + + return output + + +class 
SpectralTransform(nn.Module): + def __init__( + self, + in_channels, + out_channels, + stride=1, + groups=1, + enable_lfu=True, + separable_fu=False, + **fu_kwargs, + ): + # bn_layer not used + super(SpectralTransform, self).__init__() + self.enable_lfu = enable_lfu + if stride == 2: + self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) + else: + self.downsample = nn.Identity() + + self.stride = stride + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, out_channels // 2, kernel_size=1, groups=groups, bias=False + ), + nn.BatchNorm2d(out_channels // 2), + nn.ReLU(inplace=True), + ) + fu_class = FourierUnit + self.fu = fu_class(out_channels // 2, out_channels // 2, groups, **fu_kwargs) + if self.enable_lfu: + self.lfu = fu_class(out_channels // 2, out_channels // 2, groups) + self.conv2 = torch.nn.Conv2d( + out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False + ) + + def forward(self, x): + x = self.downsample(x) + x = self.conv1(x) + output = self.fu(x) + + if self.enable_lfu: + _, c, h, _ = x.shape + split_no = 2 + split_s = h // split_no + xs = torch.cat( + torch.split(x[:, : c // 4], split_s, dim=-2), dim=1 + ).contiguous() + xs = torch.cat(torch.split(xs, split_s, dim=-1), dim=1).contiguous() + xs = self.lfu(xs) + xs = xs.repeat(1, 1, split_no, split_no).contiguous() + else: + xs = 0 + + output = self.conv2(x + output + xs) + + return output + + +class FFC(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + enable_lfu=True, + padding_type="reflect", + gated=False, + **spectral_kwargs, + ): + super(FFC, self).__init__() + + assert stride == 1 or stride == 2, "Stride should be 1 or 2." 
+ self.stride = stride + + in_cg = int(in_channels * ratio_gin) + in_cl = in_channels - in_cg + out_cg = int(out_channels * ratio_gout) + out_cl = out_channels - out_cg + # groups_g = 1 if groups == 1 else int(groups * ratio_gout) + # groups_l = 1 if groups == 1 else groups - groups_g + + self.ratio_gin = ratio_gin + self.ratio_gout = ratio_gout + self.global_in_num = in_cg + + module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d + self.convl2l = module( + in_cl, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d + self.convl2g = module( + in_cl, + out_cg, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d + self.convg2l = module( + in_cg, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform + self.convg2g = module( + in_cg, + out_cg, + stride, + 1 if groups == 1 else groups // 2, + enable_lfu, + **spectral_kwargs, + ) + + self.gated = gated + module = ( + nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d + ) + self.gate = module(in_channels, 2, 1) + + def forward(self, x): + x_l, x_g = x if type(x) is tuple else (x, 0) + out_xl, out_xg = 0, 0 + + if self.gated: + total_input_parts = [x_l] + if torch.is_tensor(x_g): + total_input_parts.append(x_g) + total_input = torch.cat(total_input_parts, dim=1) + + gates = torch.sigmoid(self.gate(total_input)) + g2l_gate, l2g_gate = gates.chunk(2, dim=1) + else: + g2l_gate, l2g_gate = 1, 1 + + if self.ratio_gout != 1: + out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate + if self.ratio_gout != 0: + out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g) + + return out_xl, out_xg + + +class FFC_BN_ACT(nn.Module): + def 
__init__( + self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + norm_layer=nn.BatchNorm2d, + activation_layer=nn.Identity, + padding_type="reflect", + enable_lfu=True, + **kwargs, + ): + super(FFC_BN_ACT, self).__init__() + self.ffc = FFC( + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride, + padding, + dilation, + groups, + bias, + enable_lfu, + padding_type=padding_type, + **kwargs, + ) + lnorm = nn.Identity if ratio_gout == 1 else norm_layer + gnorm = nn.Identity if ratio_gout == 0 else norm_layer + global_channels = int(out_channels * ratio_gout) + self.bn_l = lnorm(out_channels - global_channels) + self.bn_g = gnorm(global_channels) + + lact = nn.Identity if ratio_gout == 1 else activation_layer + gact = nn.Identity if ratio_gout == 0 else activation_layer + self.act_l = lact(inplace=True) + self.act_g = gact(inplace=True) + + def forward(self, x): + x_l, x_g = self.ffc(x) + x_l = self.act_l(self.bn_l(x_l)) + x_g = self.act_g(self.bn_g(x_g)) + return x_l, x_g + + +class FFCResnetBlock(nn.Module): + def __init__( + self, + dim, + padding_type, + norm_layer, + activation_layer=nn.ReLU, + dilation=1, + spatial_transform_kwargs=None, + inline=False, + **conv_kwargs, + ): + super().__init__() + self.conv1 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs, + ) + self.conv2 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + **conv_kwargs, + ) + if spatial_transform_kwargs is not None: + self.conv1 = LearnableSpatialTransformWrapper( + self.conv1, **spatial_transform_kwargs + ) + self.conv2 = LearnableSpatialTransformWrapper( + self.conv2, **spatial_transform_kwargs + ) + 
self.inline = inline + + def forward(self, x): + if self.inline: + x_l, x_g = ( + x[:, : -self.conv1.ffc.global_in_num], + x[:, -self.conv1.ffc.global_in_num :], + ) + else: + x_l, x_g = x if type(x) is tuple else (x, 0) + + id_l, id_g = x_l, x_g + + x_l, x_g = self.conv1((x_l, x_g)) + x_l, x_g = self.conv2((x_l, x_g)) + + x_l, x_g = id_l + x_l, id_g + x_g + out = x_l, x_g + if self.inline: + out = torch.cat(out, dim=1) + return out + + +class ConcatTupleLayer(nn.Module): + def forward(self, x): + assert isinstance(x, tuple) + x_l, x_g = x + assert torch.is_tensor(x_l) or torch.is_tensor(x_g) + if not torch.is_tensor(x_g): + return x_l + return torch.cat(x, dim=1) + + +class FFCResNetGenerator(nn.Module): + def __init__( + self, + input_nc, + output_nc, + ngf=64, + n_downsampling=3, + n_blocks=18, + norm_layer=nn.BatchNorm2d, + padding_type="reflect", + activation_layer=nn.ReLU, + up_norm_layer=nn.BatchNorm2d, + up_activation=nn.ReLU(True), + init_conv_kwargs={}, + downsample_conv_kwargs={}, + resnet_conv_kwargs={}, + spatial_transform_layers=None, + spatial_transform_kwargs={}, + max_features=1024, + out_ffc=False, + out_ffc_kwargs={}, + ): + assert n_blocks >= 0 + super().__init__() + """ + init_conv_kwargs = {'ratio_gin': 0, 'ratio_gout': 0, 'enable_lfu': False} + downsample_conv_kwargs = {'ratio_gin': '${generator.init_conv_kwargs.ratio_gout}', 'ratio_gout': '${generator.downsample_conv_kwargs.ratio_gin}', 'enable_lfu': False} + resnet_conv_kwargs = {'ratio_gin': 0.75, 'ratio_gout': '${generator.resnet_conv_kwargs.ratio_gin}', 'enable_lfu': False} + spatial_transform_kwargs = {} + out_ffc_kwargs = {} + """ + """ + print(input_nc, output_nc, ngf, n_downsampling, n_blocks, norm_layer, + padding_type, activation_layer, + up_norm_layer, up_activation, + spatial_transform_layers, + add_out_act, max_features, out_ffc, file=sys.stderr) + + 4 3 64 3 18+ reflect + + ReLU(inplace=True) + None sigmoid 1024 False + """ + init_conv_kwargs = {"ratio_gin": 0, "ratio_gout": 0, 
"enable_lfu": False} + downsample_conv_kwargs = {"ratio_gin": 0, "ratio_gout": 0, "enable_lfu": False} + resnet_conv_kwargs = { + "ratio_gin": 0.75, + "ratio_gout": 0.75, + "enable_lfu": False, + } + spatial_transform_kwargs = {} + out_ffc_kwargs = {} + + model = [ + nn.ReflectionPad2d(3), + FFC_BN_ACT( + input_nc, + ngf, + kernel_size=7, + padding=0, + norm_layer=norm_layer, + activation_layer=activation_layer, + **init_conv_kwargs, + ), + ] + + ### downsample + for i in range(n_downsampling): + mult = 2**i + if i == n_downsampling - 1: + cur_conv_kwargs = dict(downsample_conv_kwargs) + cur_conv_kwargs["ratio_gout"] = resnet_conv_kwargs.get("ratio_gin", 0) + else: + cur_conv_kwargs = downsample_conv_kwargs + model += [ + FFC_BN_ACT( + min(max_features, ngf * mult), + min(max_features, ngf * mult * 2), + kernel_size=3, + stride=2, + padding=1, + norm_layer=norm_layer, + activation_layer=activation_layer, + **cur_conv_kwargs, + ) + ] + + mult = 2**n_downsampling + feats_num_bottleneck = min(max_features, ngf * mult) + + ### resnet blocks + for i in range(n_blocks): + cur_resblock = FFCResnetBlock( + feats_num_bottleneck, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + **resnet_conv_kwargs, + ) + if spatial_transform_layers is not None and i in spatial_transform_layers: + cur_resblock = LearnableSpatialTransformWrapper( + cur_resblock, **spatial_transform_kwargs + ) + model += [cur_resblock] + + model += [ConcatTupleLayer()] + + ### upsample + for i in range(n_downsampling): + mult = 2 ** (n_downsampling - i) + model += [ + nn.ConvTranspose2d( + min(max_features, ngf * mult), + min(max_features, int(ngf * mult / 2)), + kernel_size=3, + stride=2, + padding=1, + output_padding=1, + ), + up_norm_layer(min(max_features, int(ngf * mult / 2))), + up_activation, + ] + + if out_ffc: + model += [ + FFCResnetBlock( + ngf, + padding_type=padding_type, + activation_layer=activation_layer, + norm_layer=norm_layer, + inline=True, + 
**out_ffc_kwargs, + ) + ] + + model += [ + nn.ReflectionPad2d(3), + nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0), + ] + model.append(nn.Sigmoid()) + self.model = nn.Sequential(*model) + + def forward(self, image, mask): + return self.model(torch.cat([image, mask], dim=1)) + + +class LaMa(nn.Module): + def __init__(self, state_dict) -> None: + super(LaMa, self).__init__() + self.model_arch = "LaMa" + self.sub_type = "Inpaint" + self.in_nc = 4 + self.out_nc = 3 + self.scale = 1 + + self.min_size = None + self.pad_mod = 8 + self.pad_to_square = False + + self.model = FFCResNetGenerator(self.in_nc, self.out_nc) + self.state = { + k.replace("generator.model", "model.model"): v + for k, v in state_dict.items() + } + + self.supports_fp16 = False + self.support_bf16 = True + + self.load_state_dict(self.state, strict=False) + + def forward(self, img, mask): + masked_img = img * (1 - mask) + inpainted_mask = mask * self.model.forward(masked_img, mask) + result = inpainted_mask + (1 - mask) * img + return result diff --git a/ldm_patched/pfn/architecture/OmniSR/ChannelAttention.py b/ldm_patched/pfn/architecture/OmniSR/ChannelAttention.py new file mode 100644 index 0000000000000000000000000000000000000000..f4d52aa1e063d274b7aec7bd1ace77b19eb2ca61 --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/ChannelAttention.py @@ -0,0 +1,110 @@ +import math + +import torch.nn as nn + + +class CA_layer(nn.Module): + def __init__(self, channel, reduction=16): + super(CA_layer, self).__init__() + # global average pooling + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Conv2d(channel, channel // reduction, kernel_size=(1, 1), bias=False), + nn.GELU(), + nn.Conv2d(channel // reduction, channel, kernel_size=(1, 1), bias=False), + # nn.Sigmoid() + ) + + def forward(self, x): + y = self.fc(self.gap(x)) + return x * y.expand_as(x) + + +class Simple_CA_layer(nn.Module): + def __init__(self, channel): + super(Simple_CA_layer, self).__init__() + self.gap = 
nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d( + in_channels=channel, + out_channels=channel, + kernel_size=1, + padding=0, + stride=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return x * self.fc(self.gap(x)) + + +class ECA_layer(nn.Module): + """Constructs a ECA module. + Args: + channel: Number of channels of the input feature map + k_size: Adaptive selection of kernel size + """ + + def __init__(self, channel): + super(ECA_layer, self).__init__() + + b = 1 + gamma = 2 + k_size = int(abs(math.log(channel, 2) + b) / gamma) + k_size = k_size if k_size % 2 else k_size + 1 + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv = nn.Conv1d( + 1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False + ) + # self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # x: input features with shape [b, c, h, w] + # b, c, h, w = x.size() + + # feature descriptor on the global spatial information + y = self.avg_pool(x) + + # Two different branches of ECA module + y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1) + + # Multi-scale information fusion + # y = self.sigmoid(y) + + return x * y.expand_as(x) + + +class ECA_MaxPool_layer(nn.Module): + """Constructs a ECA module. 
+ Args: + channel: Number of channels of the input feature map + k_size: Adaptive selection of kernel size + """ + + def __init__(self, channel): + super(ECA_MaxPool_layer, self).__init__() + + b = 1 + gamma = 2 + k_size = int(abs(math.log(channel, 2) + b) / gamma) + k_size = k_size if k_size % 2 else k_size + 1 + self.max_pool = nn.AdaptiveMaxPool2d(1) + self.conv = nn.Conv1d( + 1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False + ) + # self.sigmoid = nn.Sigmoid() + + def forward(self, x): + # x: input features with shape [b, c, h, w] + # b, c, h, w = x.size() + + # feature descriptor on the global spatial information + y = self.max_pool(x) + + # Two different branches of ECA module + y = self.conv(y.squeeze(-1).transpose(-1, -2)).transpose(-1, -2).unsqueeze(-1) + + # Multi-scale information fusion + # y = self.sigmoid(y) + + return x * y.expand_as(x) diff --git a/ldm_patched/pfn/architecture/OmniSR/LICENSE b/ldm_patched/pfn/architecture/OmniSR/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/ldm_patched/pfn/architecture/OmniSR/OSA.py b/ldm_patched/pfn/architecture/OmniSR/OSA.py new file mode 100644 index 0000000000000000000000000000000000000000..d7a129696b254b022fa6fc54dc85befcc19ffc2c --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/OSA.py @@ -0,0 +1,577 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: OSA.py +# Created Date: Tuesday April 28th 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Sunday, 23rd April 2023 3:07:42 pm +# Modified By: Chen Xuanhong +# Copyright (c) 2020 Shanghai Jiao Tong University +############################################################# + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from einops.layers.torch import Rearrange, Reduce +from torch import einsum, nn + +from .layernorm import LayerNorm2d + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + return val if exists(val) else d + + +def cast_tuple(val, length=1): + return val if isinstance(val, tuple) else ((val,) * length) + + +# helper classes + + +class PreNormResidual(nn.Module): + def __init__(self, dim, fn): + super().__init__() + self.norm = nn.LayerNorm(dim) + self.fn = fn + + def forward(self, x): + return self.fn(self.norm(x)) + x + + +class Conv_PreNormResidual(nn.Module): 
+ def __init__(self, dim, fn): + super().__init__() + self.norm = LayerNorm2d(dim) + self.fn = fn + + def forward(self, x): + return self.fn(self.norm(x)) + x + + +class FeedForward(nn.Module): + def __init__(self, dim, mult=2, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + self.net = nn.Sequential( + nn.Linear(dim, inner_dim), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(inner_dim, dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +class Conv_FeedForward(nn.Module): + def __init__(self, dim, mult=2, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + self.net = nn.Sequential( + nn.Conv2d(dim, inner_dim, 1, 1, 0), + nn.GELU(), + nn.Dropout(dropout), + nn.Conv2d(inner_dim, dim, 1, 1, 0), + nn.Dropout(dropout), + ) + + def forward(self, x): + return self.net(x) + + +class Gated_Conv_FeedForward(nn.Module): + def __init__(self, dim, mult=1, bias=False, dropout=0.0): + super().__init__() + + hidden_features = int(dim * mult) + + self.project_in = nn.Conv2d(dim, hidden_features * 2, kernel_size=1, bias=bias) + + self.dwconv = nn.Conv2d( + hidden_features * 2, + hidden_features * 2, + kernel_size=3, + stride=1, + padding=1, + groups=hidden_features * 2, + bias=bias, + ) + + self.project_out = nn.Conv2d(hidden_features, dim, kernel_size=1, bias=bias) + + def forward(self, x): + x = self.project_in(x) + x1, x2 = self.dwconv(x).chunk(2, dim=1) + x = F.gelu(x1) * x2 + x = self.project_out(x) + return x + + +# MBConv + + +class SqueezeExcitation(nn.Module): + def __init__(self, dim, shrinkage_rate=0.25): + super().__init__() + hidden_dim = int(dim * shrinkage_rate) + + self.gate = nn.Sequential( + Reduce("b c h w -> b c", "mean"), + nn.Linear(dim, hidden_dim, bias=False), + nn.SiLU(), + nn.Linear(hidden_dim, dim, bias=False), + nn.Sigmoid(), + Rearrange("b c -> b c 1 1"), + ) + + def forward(self, x): + return x * self.gate(x) + + +class MBConvResidual(nn.Module): + def __init__(self, fn, dropout=0.0): + 
super().__init__() + self.fn = fn + self.dropsample = Dropsample(dropout) + + def forward(self, x): + out = self.fn(x) + out = self.dropsample(out) + return out + x + + +class Dropsample(nn.Module): + def __init__(self, prob=0): + super().__init__() + self.prob = prob + + def forward(self, x): + device = x.device + + if self.prob == 0.0 or (not self.training): + return x + + keep_mask = ( + torch.FloatTensor((x.shape[0], 1, 1, 1), device=device).uniform_() + > self.prob + ) + return x * keep_mask / (1 - self.prob) + + +def MBConv( + dim_in, dim_out, *, downsample, expansion_rate=4, shrinkage_rate=0.25, dropout=0.0 +): + hidden_dim = int(expansion_rate * dim_out) + stride = 2 if downsample else 1 + + net = nn.Sequential( + nn.Conv2d(dim_in, hidden_dim, 1), + # nn.BatchNorm2d(hidden_dim), + nn.GELU(), + nn.Conv2d( + hidden_dim, hidden_dim, 3, stride=stride, padding=1, groups=hidden_dim + ), + # nn.BatchNorm2d(hidden_dim), + nn.GELU(), + SqueezeExcitation(hidden_dim, shrinkage_rate=shrinkage_rate), + nn.Conv2d(hidden_dim, dim_out, 1), + # nn.BatchNorm2d(dim_out) + ) + + if dim_in == dim_out and not downsample: + net = MBConvResidual(net, dropout=dropout) + + return net + + +# attention related classes +class Attention(nn.Module): + def __init__( + self, + dim, + dim_head=32, + dropout=0.0, + window_size=7, + with_pe=True, + ): + super().__init__() + assert ( + dim % dim_head + ) == 0, "dimension should be divisible by dimension per head" + + self.heads = dim // dim_head + self.scale = dim_head**-0.5 + self.with_pe = with_pe + + self.to_qkv = nn.Linear(dim, dim * 3, bias=False) + + self.attend = nn.Sequential(nn.Softmax(dim=-1), nn.Dropout(dropout)) + + self.to_out = nn.Sequential( + nn.Linear(dim, dim, bias=False), nn.Dropout(dropout) + ) + + # relative positional bias + if self.with_pe: + self.rel_pos_bias = nn.Embedding((2 * window_size - 1) ** 2, self.heads) + + pos = torch.arange(window_size) + grid = torch.stack(torch.meshgrid(pos, pos)) + grid = rearrange(grid, 
"c i j -> (i j) c") + rel_pos = rearrange(grid, "i ... -> i 1 ...") - rearrange( + grid, "j ... -> 1 j ..." + ) + rel_pos += window_size - 1 + rel_pos_indices = (rel_pos * torch.tensor([2 * window_size - 1, 1])).sum( + dim=-1 + ) + + self.register_buffer("rel_pos_indices", rel_pos_indices, persistent=False) + + def forward(self, x): + batch, height, width, window_height, window_width, _, device, h = ( + *x.shape, + x.device, + self.heads, + ) + + # flatten + + x = rearrange(x, "b x y w1 w2 d -> (b x y) (w1 w2) d") + + # project for queries, keys, values + + q, k, v = self.to_qkv(x).chunk(3, dim=-1) + + # split heads + + q, k, v = map(lambda t: rearrange(t, "b n (h d ) -> b h n d", h=h), (q, k, v)) + + # scale + + q = q * self.scale + + # sim + + sim = einsum("b h i d, b h j d -> b h i j", q, k) + + # add positional bias + if self.with_pe: + bias = self.rel_pos_bias(self.rel_pos_indices) + sim = sim + rearrange(bias, "i j h -> h i j") + + # attention + + attn = self.attend(sim) + + # aggregate + + out = einsum("b h i j, b h j d -> b h i d", attn, v) + + # merge heads + + out = rearrange( + out, "b h (w1 w2) d -> b w1 w2 (h d)", w1=window_height, w2=window_width + ) + + # combine heads out + + out = self.to_out(out) + return rearrange(out, "(b x y) ... 
-> b x y ...", x=height, y=width) + + +class Block_Attention(nn.Module): + def __init__( + self, + dim, + dim_head=32, + bias=False, + dropout=0.0, + window_size=7, + with_pe=True, + ): + super().__init__() + assert ( + dim % dim_head + ) == 0, "dimension should be divisible by dimension per head" + + self.heads = dim // dim_head + self.ps = window_size + self.scale = dim_head**-0.5 + self.with_pe = with_pe + + self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1, bias=bias) + self.qkv_dwconv = nn.Conv2d( + dim * 3, + dim * 3, + kernel_size=3, + stride=1, + padding=1, + groups=dim * 3, + bias=bias, + ) + + self.attend = nn.Sequential(nn.Softmax(dim=-1), nn.Dropout(dropout)) + + self.to_out = nn.Conv2d(dim, dim, kernel_size=1, bias=bias) + + def forward(self, x): + # project for queries, keys, values + b, c, h, w = x.shape + + qkv = self.qkv_dwconv(self.qkv(x)) + q, k, v = qkv.chunk(3, dim=1) + + # split heads + + q, k, v = map( + lambda t: rearrange( + t, + "b (h d) (x w1) (y w2) -> (b x y) h (w1 w2) d", + h=self.heads, + w1=self.ps, + w2=self.ps, + ), + (q, k, v), + ) + + # scale + + q = q * self.scale + + # sim + + sim = einsum("b h i d, b h j d -> b h i j", q, k) + + # attention + attn = self.attend(sim) + + # aggregate + + out = einsum("b h i j, b h j d -> b h i d", attn, v) + + # merge heads + out = rearrange( + out, + "(b x y) head (w1 w2) d -> b (head d) (x w1) (y w2)", + x=h // self.ps, + y=w // self.ps, + head=self.heads, + w1=self.ps, + w2=self.ps, + ) + + out = self.to_out(out) + return out + + +class Channel_Attention(nn.Module): + def __init__(self, dim, heads, bias=False, dropout=0.0, window_size=7): + super(Channel_Attention, self).__init__() + self.heads = heads + + self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) + + self.ps = window_size + + self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1, bias=bias) + self.qkv_dwconv = nn.Conv2d( + dim * 3, + dim * 3, + kernel_size=3, + stride=1, + padding=1, + groups=dim * 3, + bias=bias, + ) + 
self.project_out = nn.Conv2d(dim, dim, kernel_size=1, bias=bias) + + def forward(self, x): + b, c, h, w = x.shape + + qkv = self.qkv_dwconv(self.qkv(x)) + qkv = qkv.chunk(3, dim=1) + + q, k, v = map( + lambda t: rearrange( + t, + "b (head d) (h ph) (w pw) -> b (h w) head d (ph pw)", + ph=self.ps, + pw=self.ps, + head=self.heads, + ), + qkv, + ) + + q = F.normalize(q, dim=-1) + k = F.normalize(k, dim=-1) + + attn = (q @ k.transpose(-2, -1)) * self.temperature + attn = attn.softmax(dim=-1) + out = attn @ v + + out = rearrange( + out, + "b (h w) head d (ph pw) -> b (head d) (h ph) (w pw)", + h=h // self.ps, + w=w // self.ps, + ph=self.ps, + pw=self.ps, + head=self.heads, + ) + + out = self.project_out(out) + + return out + + +class Channel_Attention_grid(nn.Module): + def __init__(self, dim, heads, bias=False, dropout=0.0, window_size=7): + super(Channel_Attention_grid, self).__init__() + self.heads = heads + + self.temperature = nn.Parameter(torch.ones(heads, 1, 1)) + + self.ps = window_size + + self.qkv = nn.Conv2d(dim, dim * 3, kernel_size=1, bias=bias) + self.qkv_dwconv = nn.Conv2d( + dim * 3, + dim * 3, + kernel_size=3, + stride=1, + padding=1, + groups=dim * 3, + bias=bias, + ) + self.project_out = nn.Conv2d(dim, dim, kernel_size=1, bias=bias) + + def forward(self, x): + b, c, h, w = x.shape + + qkv = self.qkv_dwconv(self.qkv(x)) + qkv = qkv.chunk(3, dim=1) + + q, k, v = map( + lambda t: rearrange( + t, + "b (head d) (h ph) (w pw) -> b (ph pw) head d (h w)", + ph=self.ps, + pw=self.ps, + head=self.heads, + ), + qkv, + ) + + q = F.normalize(q, dim=-1) + k = F.normalize(k, dim=-1) + + attn = (q @ k.transpose(-2, -1)) * self.temperature + attn = attn.softmax(dim=-1) + out = attn @ v + + out = rearrange( + out, + "b (ph pw) head d (h w) -> b (head d) (h ph) (w pw)", + h=h // self.ps, + w=w // self.ps, + ph=self.ps, + pw=self.ps, + head=self.heads, + ) + + out = self.project_out(out) + + return out + + +class OSA_Block(nn.Module): + def __init__( + self, + 
channel_num=64, + bias=True, + ffn_bias=True, + window_size=8, + with_pe=False, + dropout=0.0, + ): + super(OSA_Block, self).__init__() + + w = window_size + + self.layer = nn.Sequential( + MBConv( + channel_num, + channel_num, + downsample=False, + expansion_rate=1, + shrinkage_rate=0.25, + ), + Rearrange( + "b d (x w1) (y w2) -> b x y w1 w2 d", w1=w, w2=w + ), # block-like attention + PreNormResidual( + channel_num, + Attention( + dim=channel_num, + dim_head=channel_num // 4, + dropout=dropout, + window_size=window_size, + with_pe=with_pe, + ), + ), + Rearrange("b x y w1 w2 d -> b d (x w1) (y w2)"), + Conv_PreNormResidual( + channel_num, Gated_Conv_FeedForward(dim=channel_num, dropout=dropout) + ), + # channel-like attention + Conv_PreNormResidual( + channel_num, + Channel_Attention( + dim=channel_num, heads=4, dropout=dropout, window_size=window_size + ), + ), + Conv_PreNormResidual( + channel_num, Gated_Conv_FeedForward(dim=channel_num, dropout=dropout) + ), + Rearrange( + "b d (w1 x) (w2 y) -> b x y w1 w2 d", w1=w, w2=w + ), # grid-like attention + PreNormResidual( + channel_num, + Attention( + dim=channel_num, + dim_head=channel_num // 4, + dropout=dropout, + window_size=window_size, + with_pe=with_pe, + ), + ), + Rearrange("b x y w1 w2 d -> b d (w1 x) (w2 y)"), + Conv_PreNormResidual( + channel_num, Gated_Conv_FeedForward(dim=channel_num, dropout=dropout) + ), + # channel-like attention + Conv_PreNormResidual( + channel_num, + Channel_Attention_grid( + dim=channel_num, heads=4, dropout=dropout, window_size=window_size + ), + ), + Conv_PreNormResidual( + channel_num, Gated_Conv_FeedForward(dim=channel_num, dropout=dropout) + ), + ) + + def forward(self, x): + out = self.layer(x) + return out diff --git a/ldm_patched/pfn/architecture/OmniSR/OSAG.py b/ldm_patched/pfn/architecture/OmniSR/OSAG.py new file mode 100644 index 0000000000000000000000000000000000000000..477e81f9da4eb1db9b5ec418549d75dd591209ec --- /dev/null +++ 
b/ldm_patched/pfn/architecture/OmniSR/OSAG.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: OSAG.py +# Created Date: Tuesday April 28th 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Sunday, 23rd April 2023 3:08:49 pm +# Modified By: Chen Xuanhong +# Copyright (c) 2020 Shanghai Jiao Tong University +############################################################# + + +import torch.nn as nn + +from .esa import ESA +from .OSA import OSA_Block + + +class OSAG(nn.Module): + def __init__( + self, + channel_num=64, + bias=True, + block_num=4, + ffn_bias=False, + window_size=0, + pe=False, + ): + super(OSAG, self).__init__() + + # print("window_size: %d" % (window_size)) + # print("with_pe", pe) + # print("ffn_bias: %d" % (ffn_bias)) + + # block_script_name = kwargs.get("block_script_name", "OSA") + # block_class_name = kwargs.get("block_class_name", "OSA_Block") + + # script_name = "." 
+ block_script_name + # package = __import__(script_name, fromlist=True) + block_class = OSA_Block # getattr(package, block_class_name) + group_list = [] + for _ in range(block_num): + temp_res = block_class( + channel_num, + bias, + ffn_bias=ffn_bias, + window_size=window_size, + with_pe=pe, + ) + group_list.append(temp_res) + group_list.append(nn.Conv2d(channel_num, channel_num, 1, 1, 0, bias=bias)) + self.residual_layer = nn.Sequential(*group_list) + esa_channel = max(channel_num // 4, 16) + self.esa = ESA(esa_channel, channel_num) + + def forward(self, x): + out = self.residual_layer(x) + out = out + x + return self.esa(out) diff --git a/ldm_patched/pfn/architecture/OmniSR/OmniSR.py b/ldm_patched/pfn/architecture/OmniSR/OmniSR.py new file mode 100644 index 0000000000000000000000000000000000000000..1e1c3f35e657fb972d4209456719a61163831385 --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/OmniSR.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: OmniSR.py +# Created Date: Tuesday April 28th 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Sunday, 23rd April 2023 3:06:36 pm +# Modified By: Chen Xuanhong +# Copyright (c) 2020 Shanghai Jiao Tong University +############################################################# + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .OSAG import OSAG +from .pixelshuffle import pixelshuffle_block + + +class OmniSR(nn.Module): + def __init__( + self, + state_dict, + **kwargs, + ): + super(OmniSR, self).__init__() + self.state = state_dict + + bias = True # Fine to assume this for now + block_num = 1 # Fine to assume this for now + ffn_bias = True + pe = True + + num_feat = state_dict["input.weight"].shape[0] or 64 + num_in_ch = state_dict["input.weight"].shape[1] or 3 + num_out_ch = num_in_ch # we can just assume this for now. 
pixelshuffle smh + + pixelshuffle_shape = state_dict["up.0.weight"].shape[0] + up_scale = math.sqrt(pixelshuffle_shape / num_out_ch) + if up_scale - int(up_scale) > 0: + print( + "out_nc is probably different than in_nc, scale calculation might be wrong" + ) + up_scale = int(up_scale) + res_num = 0 + for key in state_dict.keys(): + if "residual_layer" in key: + temp_res_num = int(key.split(".")[1]) + if temp_res_num > res_num: + res_num = temp_res_num + res_num = res_num + 1 # zero-indexed + + residual_layer = [] + self.res_num = res_num + + if ( + "residual_layer.0.residual_layer.0.layer.2.fn.rel_pos_bias.weight" + in state_dict.keys() + ): + rel_pos_bias_weight = state_dict[ + "residual_layer.0.residual_layer.0.layer.2.fn.rel_pos_bias.weight" + ].shape[0] + self.window_size = int((math.sqrt(rel_pos_bias_weight) + 1) / 2) + else: + self.window_size = 8 + + self.up_scale = up_scale + + for _ in range(res_num): + temp_res = OSAG( + channel_num=num_feat, + bias=bias, + block_num=block_num, + ffn_bias=ffn_bias, + window_size=self.window_size, + pe=pe, + ) + residual_layer.append(temp_res) + self.residual_layer = nn.Sequential(*residual_layer) + self.input = nn.Conv2d( + in_channels=num_in_ch, + out_channels=num_feat, + kernel_size=3, + stride=1, + padding=1, + bias=bias, + ) + self.output = nn.Conv2d( + in_channels=num_feat, + out_channels=num_feat, + kernel_size=3, + stride=1, + padding=1, + bias=bias, + ) + self.up = pixelshuffle_block(num_feat, num_out_ch, up_scale, bias=bias) + + # self.tail = pixelshuffle_block(num_feat,num_out_ch,up_scale,bias=bias) + + # for m in self.modules(): + # if isinstance(m, nn.Conv2d): + # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + # m.weight.data.normal_(0, sqrt(2. 
/ n)) + + # chaiNNer specific stuff + self.model_arch = "OmniSR" + self.sub_type = "SR" + self.in_nc = num_in_ch + self.out_nc = num_out_ch + self.num_feat = num_feat + self.scale = up_scale + + self.supports_fp16 = True # TODO: Test this + self.supports_bfp16 = True + self.min_size_restriction = 16 + + self.load_state_dict(state_dict, strict=False) + + def check_image_size(self, x): + _, _, h, w = x.size() + # import pdb; pdb.set_trace() + mod_pad_h = (self.window_size - h % self.window_size) % self.window_size + mod_pad_w = (self.window_size - w % self.window_size) % self.window_size + # x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), 'reflect') + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "constant", 0) + return x + + def forward(self, x): + H, W = x.shape[2:] + x = self.check_image_size(x) + + residual = self.input(x) + out = self.residual_layer(residual) + + # origin + out = torch.add(self.output(out), residual) + out = self.up(out) + + out = out[:, :, : H * self.up_scale, : W * self.up_scale] + return out diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSA.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSA.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9c94a299ef2bb012069cb5f36f5ec7b96960fd4 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSA.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSAG.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSAG.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bd34319c4ef8100a4107df80aa616d92df46a87 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OSAG.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/OmniSR.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OmniSR.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..14509f1e89005e5bcec53b54b4d6c9c67f968873 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/OmniSR.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/esa.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/esa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1843ab7e423429fb63ae861246f3b36b107e5732 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/esa.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/layernorm.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/layernorm.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f0c3df2a6e0e11e80e89dc86a3287af8e9bcc817 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/layernorm.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/__pycache__/pixelshuffle.cpython-310.pyc b/ldm_patched/pfn/architecture/OmniSR/__pycache__/pixelshuffle.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0eafa3a2a6e65070e504924035c7a36f6ad62d28 Binary files /dev/null and b/ldm_patched/pfn/architecture/OmniSR/__pycache__/pixelshuffle.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/OmniSR/esa.py b/ldm_patched/pfn/architecture/OmniSR/esa.py new file mode 100644 index 0000000000000000000000000000000000000000..f9ce7f7a60bfe20b3737eaa2e3110fd460a2d104 --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/esa.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: esa.py +# Created Date: Tuesday April 28th 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Thursday, 20th April 2023 9:28:06 am +# Modified By: Chen Xuanhong +# Copyright (c) 2020 Shanghai Jiao Tong University 
+############################################################# + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .layernorm import LayerNorm2d + + +def moment(x, dim=(2, 3), k=2): + assert len(x.size()) == 4 + mean = torch.mean(x, dim=dim).unsqueeze(-1).unsqueeze(-1) + mk = (1 / (x.size(2) * x.size(3))) * torch.sum(torch.pow(x - mean, k), dim=dim) + return mk + + +class ESA(nn.Module): + """ + Modification of Enhanced Spatial Attention (ESA), which is proposed by + `Residual Feature Aggregation Network for Image Super-Resolution` + Note: `conv_max` and `conv3_` are NOT used here, so the corresponding codes + are deleted. + """ + + def __init__(self, esa_channels, n_feats, conv=nn.Conv2d): + super(ESA, self).__init__() + f = esa_channels + self.conv1 = conv(n_feats, f, kernel_size=1) + self.conv_f = conv(f, f, kernel_size=1) + self.conv2 = conv(f, f, kernel_size=3, stride=2, padding=0) + self.conv3 = conv(f, f, kernel_size=3, padding=1) + self.conv4 = conv(f, n_feats, kernel_size=1) + self.sigmoid = nn.Sigmoid() + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + c1_ = self.conv1(x) + c1 = self.conv2(c1_) + v_max = F.max_pool2d(c1, kernel_size=7, stride=3) + c3 = self.conv3(v_max) + c3 = F.interpolate( + c3, (x.size(2), x.size(3)), mode="bilinear", align_corners=False + ) + cf = self.conv_f(c1_) + c4 = self.conv4(c3 + cf) + m = self.sigmoid(c4) + return x * m + + +class LK_ESA(nn.Module): + def __init__( + self, esa_channels, n_feats, conv=nn.Conv2d, kernel_expand=1, bias=True + ): + super(LK_ESA, self).__init__() + f = esa_channels + self.conv1 = conv(n_feats, f, kernel_size=1) + self.conv_f = conv(f, f, kernel_size=1) + + kernel_size = 17 + kernel_expand = kernel_expand + padding = kernel_size // 2 + + self.vec_conv = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(1, kernel_size), + padding=(0, padding), + groups=2, + bias=bias, + ) + self.vec_conv3x1 = nn.Conv2d( + 
in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(1, 3), + padding=(0, 1), + groups=2, + bias=bias, + ) + + self.hor_conv = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(kernel_size, 1), + padding=(padding, 0), + groups=2, + bias=bias, + ) + self.hor_conv1x3 = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(3, 1), + padding=(1, 0), + groups=2, + bias=bias, + ) + + self.conv4 = conv(f, n_feats, kernel_size=1) + self.sigmoid = nn.Sigmoid() + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + c1_ = self.conv1(x) + + res = self.vec_conv(c1_) + self.vec_conv3x1(c1_) + res = self.hor_conv(res) + self.hor_conv1x3(res) + + cf = self.conv_f(c1_) + c4 = self.conv4(res + cf) + m = self.sigmoid(c4) + return x * m + + +class LK_ESA_LN(nn.Module): + def __init__( + self, esa_channels, n_feats, conv=nn.Conv2d, kernel_expand=1, bias=True + ): + super(LK_ESA_LN, self).__init__() + f = esa_channels + self.conv1 = conv(n_feats, f, kernel_size=1) + self.conv_f = conv(f, f, kernel_size=1) + + kernel_size = 17 + kernel_expand = kernel_expand + padding = kernel_size // 2 + + self.norm = LayerNorm2d(n_feats) + + self.vec_conv = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(1, kernel_size), + padding=(0, padding), + groups=2, + bias=bias, + ) + self.vec_conv3x1 = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(1, 3), + padding=(0, 1), + groups=2, + bias=bias, + ) + + self.hor_conv = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(kernel_size, 1), + padding=(padding, 0), + groups=2, + bias=bias, + ) + self.hor_conv1x3 = nn.Conv2d( + in_channels=f * kernel_expand, + out_channels=f * kernel_expand, + kernel_size=(3, 1), + padding=(1, 0), + groups=2, + bias=bias, + ) + + self.conv4 = conv(f, n_feats, kernel_size=1) + 
self.sigmoid = nn.Sigmoid() + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + c1_ = self.norm(x) + c1_ = self.conv1(c1_) + + res = self.vec_conv(c1_) + self.vec_conv3x1(c1_) + res = self.hor_conv(res) + self.hor_conv1x3(res) + + cf = self.conv_f(c1_) + c4 = self.conv4(res + cf) + m = self.sigmoid(c4) + return x * m + + +class AdaGuidedFilter(nn.Module): + def __init__( + self, esa_channels, n_feats, conv=nn.Conv2d, kernel_expand=1, bias=True + ): + super(AdaGuidedFilter, self).__init__() + + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d( + in_channels=n_feats, + out_channels=1, + kernel_size=1, + padding=0, + stride=1, + groups=1, + bias=True, + ) + + self.r = 5 + + def box_filter(self, x, r): + channel = x.shape[1] + kernel_size = 2 * r + 1 + weight = 1.0 / (kernel_size**2) + box_kernel = weight * torch.ones( + (channel, 1, kernel_size, kernel_size), dtype=torch.float32, device=x.device + ) + output = F.conv2d(x, weight=box_kernel, stride=1, padding=r, groups=channel) + return output + + def forward(self, x): + _, _, H, W = x.shape + N = self.box_filter( + torch.ones((1, 1, H, W), dtype=x.dtype, device=x.device), self.r + ) + + # epsilon = self.fc(self.gap(x)) + # epsilon = torch.pow(epsilon, 2) + epsilon = 1e-2 + + mean_x = self.box_filter(x, self.r) / N + var_x = self.box_filter(x * x, self.r) / N - mean_x * mean_x + + A = var_x / (var_x + epsilon) + b = (1 - A) * mean_x + m = A * x + b + + # mean_A = self.box_filter(A, self.r) / N + # mean_b = self.box_filter(b, self.r) / N + # m = mean_A * x + mean_b + return x * m + + +class AdaConvGuidedFilter(nn.Module): + def __init__( + self, esa_channels, n_feats, conv=nn.Conv2d, kernel_expand=1, bias=True + ): + super(AdaConvGuidedFilter, self).__init__() + f = esa_channels + + self.conv_f = conv(f, f, kernel_size=1) + + kernel_size = 17 + kernel_expand = kernel_expand + padding = kernel_size // 2 + + self.vec_conv = nn.Conv2d( + in_channels=f, + out_channels=f, + kernel_size=(1, kernel_size), + 
padding=(0, padding), + groups=f, + bias=bias, + ) + + self.hor_conv = nn.Conv2d( + in_channels=f, + out_channels=f, + kernel_size=(kernel_size, 1), + padding=(padding, 0), + groups=f, + bias=bias, + ) + + self.gap = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Conv2d( + in_channels=f, + out_channels=f, + kernel_size=1, + padding=0, + stride=1, + groups=1, + bias=True, + ) + + def forward(self, x): + y = self.vec_conv(x) + y = self.hor_conv(y) + + sigma = torch.pow(y, 2) + epsilon = self.fc(self.gap(y)) + + weight = sigma / (sigma + epsilon) + + m = weight * x + (1 - weight) + + return x * m diff --git a/ldm_patched/pfn/architecture/OmniSR/layernorm.py b/ldm_patched/pfn/architecture/OmniSR/layernorm.py new file mode 100644 index 0000000000000000000000000000000000000000..731a25f7542d45757a284648055d7c6ffad4c3fd --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/layernorm.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: layernorm.py +# Created Date: Tuesday April 28th 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Thursday, 20th April 2023 9:28:20 am +# Modified By: Chen Xuanhong +# Copyright (c) 2020 Shanghai Jiao Tong University +############################################################# + +import torch +import torch.nn as nn + + +class LayerNormFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, x, weight, bias, eps): + ctx.eps = eps + N, C, H, W = x.size() + mu = x.mean(1, keepdim=True) + var = (x - mu).pow(2).mean(1, keepdim=True) + y = (x - mu) / (var + eps).sqrt() + ctx.save_for_backward(y, var, weight) + y = weight.view(1, C, 1, 1) * y + bias.view(1, C, 1, 1) + return y + + @staticmethod + def backward(ctx, grad_output): + eps = ctx.eps + + N, C, H, W = grad_output.size() + y, var, weight = ctx.saved_variables + g = grad_output * weight.view(1, C, 1, 1) + mean_g = g.mean(dim=1, keepdim=True) + + mean_gy = (g * 
y).mean(dim=1, keepdim=True) + gx = 1.0 / torch.sqrt(var + eps) * (g - y * mean_gy - mean_g) + return ( + gx, + (grad_output * y).sum(dim=3).sum(dim=2).sum(dim=0), + grad_output.sum(dim=3).sum(dim=2).sum(dim=0), + None, + ) + + +class LayerNorm2d(nn.Module): + def __init__(self, channels, eps=1e-6): + super(LayerNorm2d, self).__init__() + self.register_parameter("weight", nn.Parameter(torch.ones(channels))) + self.register_parameter("bias", nn.Parameter(torch.zeros(channels))) + self.eps = eps + + def forward(self, x): + return LayerNormFunction.apply(x, self.weight, self.bias, self.eps) + + +class GRN(nn.Module): + """GRN (Global Response Normalization) layer""" + + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.zeros(1, dim, 1, 1)) + self.beta = nn.Parameter(torch.zeros(1, dim, 1, 1)) + + def forward(self, x): + Gx = torch.norm(x, p=2, dim=(2, 3), keepdim=True) + Nx = Gx / (Gx.mean(dim=1, keepdim=True) + 1e-6) + return self.gamma * (x * Nx) + self.beta + x diff --git a/ldm_patched/pfn/architecture/OmniSR/pixelshuffle.py b/ldm_patched/pfn/architecture/OmniSR/pixelshuffle.py new file mode 100644 index 0000000000000000000000000000000000000000..4260fb7c9d8d912e34899ce7877595b617f9bb02 --- /dev/null +++ b/ldm_patched/pfn/architecture/OmniSR/pixelshuffle.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +############################################################# +# File: pixelshuffle.py +# Created Date: Friday July 1st 2022 +# Author: Chen Xuanhong +# Email: chenxuanhongzju@outlook.com +# Last Modified: Friday, 1st July 2022 10:18:39 am +# Modified By: Chen Xuanhong +# Copyright (c) 2022 Shanghai Jiao Tong University +############################################################# + +import torch.nn as nn + + +def pixelshuffle_block( + in_channels, out_channels, upscale_factor=2, kernel_size=3, bias=False +): + """ + Upsample features according to `upscale_factor`. 
+ """ + padding = kernel_size // 2 + conv = nn.Conv2d( + in_channels, + out_channels * (upscale_factor**2), + kernel_size, + padding=1, + bias=bias, + ) + pixel_shuffle = nn.PixelShuffle(upscale_factor) + return nn.Sequential(*[conv, pixel_shuffle]) diff --git a/ldm_patched/pfn/architecture/RRDB.py b/ldm_patched/pfn/architecture/RRDB.py new file mode 100644 index 0000000000000000000000000000000000000000..8d318b90b865fecd0b88adc6daf2c6d2e29860a3 --- /dev/null +++ b/ldm_patched/pfn/architecture/RRDB.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import functools +import math +import re +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . import block as B + + +# Borrowed from https://github.com/rlaphoenix/VSGAN/blob/master/vsgan/archs/esrgan.py +# Which enhanced stuff that was already here +class RRDBNet(nn.Module): + def __init__( + self, + state_dict, + norm=None, + act: str = "leakyrelu", + upsampler: str = "upconv", + mode: B.ConvMode = "CNA", + ) -> None: + """ + ESRGAN - Enhanced Super-Resolution Generative Adversarial Networks. + By Xintao Wang, Ke Yu, Shixiang Wu, Jinjin Gu, Yihao Liu, Chao Dong, Yu Qiao, + and Chen Change Loy. + This is old-arch Residual in Residual Dense Block Network and is not + the newest revision that's available at github.com/xinntao/ESRGAN. + This is on purpose, the newest Network has severely limited the + potential use of the Network with no benefits. + This network supports model files from both new and old-arch. + Args: + norm: Normalization layer + act: Activation layer + upsampler: Upsample layer. 
upconv, pixel_shuffle + mode: Convolution mode + """ + super(RRDBNet, self).__init__() + self.model_arch = "ESRGAN" + self.sub_type = "SR" + + self.state = state_dict + self.norm = norm + self.act = act + self.upsampler = upsampler + self.mode = mode + + self.state_map = { + # currently supports old, new, and newer RRDBNet arch models + # ESRGAN, BSRGAN/RealSR, Real-ESRGAN + "model.0.weight": ("conv_first.weight",), + "model.0.bias": ("conv_first.bias",), + "model.1.sub./NB/.weight": ("trunk_conv.weight", "conv_body.weight"), + "model.1.sub./NB/.bias": ("trunk_conv.bias", "conv_body.bias"), + r"model.1.sub.\1.RDB\2.conv\3.0.\4": ( + r"RRDB_trunk\.(\d+)\.RDB(\d)\.conv(\d+)\.(weight|bias)", + r"body\.(\d+)\.rdb(\d)\.conv(\d+)\.(weight|bias)", + ), + } + if "params_ema" in self.state: + self.state = self.state["params_ema"] + # self.model_arch = "RealESRGAN" + self.num_blocks = self.get_num_blocks() + self.plus = any("conv1x1" in k for k in self.state.keys()) + if self.plus: + self.model_arch = "ESRGAN+" + + self.state = self.new_to_old_arch(self.state) + + self.key_arr = list(self.state.keys()) + + self.in_nc: int = self.state[self.key_arr[0]].shape[1] + self.out_nc: int = self.state[self.key_arr[-1]].shape[0] + + self.scale: int = self.get_scale() + self.num_filters: int = self.state[self.key_arr[0]].shape[0] + + c2x2 = False + if self.state["model.0.weight"].shape[-2] == 2: + c2x2 = True + self.scale = round(math.sqrt(self.scale / 4)) + self.model_arch = "ESRGAN-2c2" + + self.supports_fp16 = True + self.supports_bfp16 = True + self.min_size_restriction = None + + # Detect if pixelunshuffle was used (Real-ESRGAN) + if self.in_nc in (self.out_nc * 4, self.out_nc * 16) and self.out_nc in ( + self.in_nc / 4, + self.in_nc / 16, + ): + self.shuffle_factor = int(math.sqrt(self.in_nc / self.out_nc)) + else: + self.shuffle_factor = None + + upsample_block = { + "upconv": B.upconv_block, + "pixel_shuffle": B.pixelshuffle_block, + }.get(self.upsampler) + if upsample_block is 
None: + raise NotImplementedError(f"Upsample mode [{self.upsampler}] is not found") + + if self.scale == 3: + upsample_blocks = upsample_block( + in_nc=self.num_filters, + out_nc=self.num_filters, + upscale_factor=3, + act_type=self.act, + c2x2=c2x2, + ) + else: + upsample_blocks = [ + upsample_block( + in_nc=self.num_filters, + out_nc=self.num_filters, + act_type=self.act, + c2x2=c2x2, + ) + for _ in range(int(math.log(self.scale, 2))) + ] + + self.model = B.sequential( + # fea conv + B.conv_block( + in_nc=self.in_nc, + out_nc=self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + c2x2=c2x2, + ), + B.ShortcutBlock( + B.sequential( + # rrdb blocks + *[ + B.RRDB( + nf=self.num_filters, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=self.norm, + act_type=self.act, + mode="CNA", + plus=self.plus, + c2x2=c2x2, + ) + for _ in range(self.num_blocks) + ], + # lr conv + B.conv_block( + in_nc=self.num_filters, + out_nc=self.num_filters, + kernel_size=3, + norm_type=self.norm, + act_type=None, + mode=self.mode, + c2x2=c2x2, + ), + ) + ), + *upsample_blocks, + # hr_conv0 + B.conv_block( + in_nc=self.num_filters, + out_nc=self.num_filters, + kernel_size=3, + norm_type=None, + act_type=self.act, + c2x2=c2x2, + ), + # hr_conv1 + B.conv_block( + in_nc=self.num_filters, + out_nc=self.out_nc, + kernel_size=3, + norm_type=None, + act_type=None, + c2x2=c2x2, + ), + ) + + # Adjust these properties for calculations outside of the model + if self.shuffle_factor: + self.in_nc //= self.shuffle_factor**2 + self.scale //= self.shuffle_factor + + self.load_state_dict(self.state, strict=False) + + def new_to_old_arch(self, state): + """Convert a new-arch model state dictionary to an old-arch dictionary.""" + if "params_ema" in state: + state = state["params_ema"] + + if "conv_first.weight" not in state: + # model is already old arch, this is a loose check, but should be sufficient + return state + + # add nb to state keys + for kind in 
("weight", "bias"): + self.state_map[f"model.1.sub.{self.num_blocks}.{kind}"] = self.state_map[ + f"model.1.sub./NB/.{kind}" + ] + del self.state_map[f"model.1.sub./NB/.{kind}"] + + old_state = OrderedDict() + for old_key, new_keys in self.state_map.items(): + for new_key in new_keys: + if r"\1" in old_key: + for k, v in state.items(): + sub = re.sub(new_key, old_key, k) + if sub != k: + old_state[sub] = v + else: + if new_key in state: + old_state[old_key] = state[new_key] + + # upconv layers + max_upconv = 0 + for key in state.keys(): + match = re.match(r"(upconv|conv_up)(\d)\.(weight|bias)", key) + if match is not None: + _, key_num, key_type = match.groups() + old_state[f"model.{int(key_num) * 3}.{key_type}"] = state[key] + max_upconv = max(max_upconv, int(key_num) * 3) + + # final layers + for key in state.keys(): + if key in ("HRconv.weight", "conv_hr.weight"): + old_state[f"model.{max_upconv + 2}.weight"] = state[key] + elif key in ("HRconv.bias", "conv_hr.bias"): + old_state[f"model.{max_upconv + 2}.bias"] = state[key] + elif key in ("conv_last.weight",): + old_state[f"model.{max_upconv + 4}.weight"] = state[key] + elif key in ("conv_last.bias",): + old_state[f"model.{max_upconv + 4}.bias"] = state[key] + + # Sort by first numeric value of each layer + def compare(item1, item2): + parts1 = item1.split(".") + parts2 = item2.split(".") + int1 = int(parts1[1]) + int2 = int(parts2[1]) + return int1 - int2 + + sorted_keys = sorted(old_state.keys(), key=functools.cmp_to_key(compare)) + + # Rebuild the output dict in the right order + out_dict = OrderedDict((k, old_state[k]) for k in sorted_keys) + + return out_dict + + def get_scale(self, min_part: int = 6) -> int: + n = 0 + for part in list(self.state): + parts = part.split(".")[1:] + if len(parts) == 2: + part_num = int(parts[0]) + if part_num > min_part and parts[1] == "weight": + n += 1 + return 2**n + + def get_num_blocks(self) -> int: + nbs = [] + state_keys = 
self.state_map[r"model.1.sub.\1.RDB\2.conv\3.0.\4"] + ( + r"model\.\d+\.sub\.(\d+)\.RDB(\d+)\.conv(\d+)\.0\.(weight|bias)", + ) + for state_key in state_keys: + for k in self.state: + m = re.search(state_key, k) + if m: + nbs.append(int(m.group(1))) + if nbs: + break + return max(*nbs) + 1 + + def forward(self, x): + if self.shuffle_factor: + _, _, h, w = x.size() + mod_pad_h = ( + self.shuffle_factor - h % self.shuffle_factor + ) % self.shuffle_factor + mod_pad_w = ( + self.shuffle_factor - w % self.shuffle_factor + ) % self.shuffle_factor + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect") + x = torch.pixel_unshuffle(x, downscale_factor=self.shuffle_factor) + x = self.model(x) + return x[:, :, : h * self.scale, : w * self.scale] + return self.model(x) diff --git a/ldm_patched/pfn/architecture/SCUNet.py b/ldm_patched/pfn/architecture/SCUNet.py new file mode 100644 index 0000000000000000000000000000000000000000..b8354a873085140e9ff7d582c43ba9818ed9524e --- /dev/null +++ b/ldm_patched/pfn/architecture/SCUNet.py @@ -0,0 +1,455 @@ +# pylint: skip-file +# ----------------------------------------------------------------------------------- +# SCUNet: Practical Blind Denoising via Swin-Conv-UNet and Data Synthesis, https://arxiv.org/abs/2203.13278 +# Zhang, Kai and Li, Yawei and Liang, Jingyun and Cao, Jiezhang and Zhang, Yulun and Tang, Hao and Timofte, Radu and Van Gool, Luc +# ----------------------------------------------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from einops.layers.torch import Rearrange + +from .timm.drop import DropPath +from .timm.weight_init import trunc_normal_ + + +# Borrowed from https://github.com/cszn/SCUNet/blob/main/models/network_scunet.py +class WMSA(nn.Module): + """Self-attention module in Swin Transformer""" + + def __init__(self, input_dim, output_dim, head_dim, window_size, type): + super(WMSA, self).__init__() + 
self.input_dim = input_dim + self.output_dim = output_dim + self.head_dim = head_dim + self.scale = self.head_dim**-0.5 + self.n_heads = input_dim // head_dim + self.window_size = window_size + self.type = type + self.embedding_layer = nn.Linear(self.input_dim, 3 * self.input_dim, bias=True) + + self.relative_position_params = nn.Parameter( + torch.zeros((2 * window_size - 1) * (2 * window_size - 1), self.n_heads) + ) + # TODO recover + # self.relative_position_params = nn.Parameter(torch.zeros(self.n_heads, 2 * window_size - 1, 2 * window_size -1)) + self.relative_position_params = nn.Parameter( + torch.zeros((2 * window_size - 1) * (2 * window_size - 1), self.n_heads) + ) + + self.linear = nn.Linear(self.input_dim, self.output_dim) + + trunc_normal_(self.relative_position_params, std=0.02) + self.relative_position_params = torch.nn.Parameter( + self.relative_position_params.view( + 2 * window_size - 1, 2 * window_size - 1, self.n_heads + ) + .transpose(1, 2) + .transpose(0, 1) + ) + + def generate_mask(self, h, w, p, shift): + """generating the mask of SW-MSA + Args: + shift: shift parameters in CyclicShift. + Returns: + attn_mask: should be (1 1 w p p), + """ + # supporting square. + attn_mask = torch.zeros( + h, + w, + p, + p, + p, + p, + dtype=torch.bool, + device=self.relative_position_params.device, + ) + if self.type == "W": + return attn_mask + + s = p - shift + attn_mask[-1, :, :s, :, s:, :] = True + attn_mask[-1, :, s:, :, :s, :] = True + attn_mask[:, -1, :, :s, :, s:] = True + attn_mask[:, -1, :, s:, :, :s] = True + attn_mask = rearrange( + attn_mask, "w1 w2 p1 p2 p3 p4 -> 1 1 (w1 w2) (p1 p2) (p3 p4)" + ) + return attn_mask + + def forward(self, x): + """Forward pass of Window Multi-head Self-attention module. 
+ Args: + x: input tensor with shape of [b h w c]; + attn_mask: attention mask, fill -inf where the value is True; + Returns: + output: tensor shape [b h w c] + """ + if self.type != "W": + x = torch.roll( + x, + shifts=(-(self.window_size // 2), -(self.window_size // 2)), + dims=(1, 2), + ) + + x = rearrange( + x, + "b (w1 p1) (w2 p2) c -> b w1 w2 p1 p2 c", + p1=self.window_size, + p2=self.window_size, + ) + h_windows = x.size(1) + w_windows = x.size(2) + # square validation + # assert h_windows == w_windows + + x = rearrange( + x, + "b w1 w2 p1 p2 c -> b (w1 w2) (p1 p2) c", + p1=self.window_size, + p2=self.window_size, + ) + qkv = self.embedding_layer(x) + q, k, v = rearrange( + qkv, "b nw np (threeh c) -> threeh b nw np c", c=self.head_dim + ).chunk(3, dim=0) + sim = torch.einsum("hbwpc,hbwqc->hbwpq", q, k) * self.scale + # Adding learnable relative embedding + sim = sim + rearrange(self.relative_embedding(), "h p q -> h 1 1 p q") + # Using Attn Mask to distinguish different subwindows. 
+ if self.type != "W": + attn_mask = self.generate_mask( + h_windows, w_windows, self.window_size, shift=self.window_size // 2 + ) + sim = sim.masked_fill_(attn_mask, float("-inf")) + + probs = nn.functional.softmax(sim, dim=-1) + output = torch.einsum("hbwij,hbwjc->hbwic", probs, v) + output = rearrange(output, "h b w p c -> b w p (h c)") + output = self.linear(output) + output = rearrange( + output, + "b (w1 w2) (p1 p2) c -> b (w1 p1) (w2 p2) c", + w1=h_windows, + p1=self.window_size, + ) + + if self.type != "W": + output = torch.roll( + output, + shifts=(self.window_size // 2, self.window_size // 2), + dims=(1, 2), + ) + + return output + + def relative_embedding(self): + cord = torch.tensor( + np.array( + [ + [i, j] + for i in range(self.window_size) + for j in range(self.window_size) + ] + ) + ) + relation = cord[:, None, :] - cord[None, :, :] + self.window_size - 1 + # negative is allowed + return self.relative_position_params[ + :, relation[:, :, 0].long(), relation[:, :, 1].long() + ] + + +class Block(nn.Module): + def __init__( + self, + input_dim, + output_dim, + head_dim, + window_size, + drop_path, + type="W", + input_resolution=None, + ): + """SwinTransformer Block""" + super(Block, self).__init__() + self.input_dim = input_dim + self.output_dim = output_dim + assert type in ["W", "SW"] + self.type = type + if input_resolution <= window_size: + self.type = "W" + + self.ln1 = nn.LayerNorm(input_dim) + self.msa = WMSA(input_dim, input_dim, head_dim, window_size, self.type) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.ln2 = nn.LayerNorm(input_dim) + self.mlp = nn.Sequential( + nn.Linear(input_dim, 4 * input_dim), + nn.GELU(), + nn.Linear(4 * input_dim, output_dim), + ) + + def forward(self, x): + x = x + self.drop_path(self.msa(self.ln1(x))) + x = x + self.drop_path(self.mlp(self.ln2(x))) + return x + + +class ConvTransBlock(nn.Module): + def __init__( + self, + conv_dim, + trans_dim, + head_dim, + window_size, + 
drop_path, + type="W", + input_resolution=None, + ): + """SwinTransformer and Conv Block""" + super(ConvTransBlock, self).__init__() + self.conv_dim = conv_dim + self.trans_dim = trans_dim + self.head_dim = head_dim + self.window_size = window_size + self.drop_path = drop_path + self.type = type + self.input_resolution = input_resolution + + assert self.type in ["W", "SW"] + if self.input_resolution <= self.window_size: + self.type = "W" + + self.trans_block = Block( + self.trans_dim, + self.trans_dim, + self.head_dim, + self.window_size, + self.drop_path, + self.type, + self.input_resolution, + ) + self.conv1_1 = nn.Conv2d( + self.conv_dim + self.trans_dim, + self.conv_dim + self.trans_dim, + 1, + 1, + 0, + bias=True, + ) + self.conv1_2 = nn.Conv2d( + self.conv_dim + self.trans_dim, + self.conv_dim + self.trans_dim, + 1, + 1, + 0, + bias=True, + ) + + self.conv_block = nn.Sequential( + nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False), + nn.ReLU(True), + nn.Conv2d(self.conv_dim, self.conv_dim, 3, 1, 1, bias=False), + ) + + def forward(self, x): + conv_x, trans_x = torch.split( + self.conv1_1(x), (self.conv_dim, self.trans_dim), dim=1 + ) + conv_x = self.conv_block(conv_x) + conv_x + trans_x = Rearrange("b c h w -> b h w c")(trans_x) + trans_x = self.trans_block(trans_x) + trans_x = Rearrange("b h w c -> b c h w")(trans_x) + res = self.conv1_2(torch.cat((conv_x, trans_x), dim=1)) + x = x + res + + return x + + +class SCUNet(nn.Module): + def __init__( + self, + state_dict, + in_nc=3, + config=[4, 4, 4, 4, 4, 4, 4], + dim=64, + drop_path_rate=0.0, + input_resolution=256, + ): + super(SCUNet, self).__init__() + self.model_arch = "SCUNet" + self.sub_type = "SR" + + self.num_filters: int = 0 + + self.state = state_dict + self.config = config + self.dim = dim + self.head_dim = 32 + self.window_size = 8 + + self.in_nc = in_nc + self.out_nc = self.in_nc + self.scale = 1 + self.supports_fp16 = True + + # drop path rate for each layer + dpr = [x.item() for x in 
torch.linspace(0, drop_path_rate, sum(config))] + + self.m_head = [nn.Conv2d(in_nc, dim, 3, 1, 1, bias=False)] + + begin = 0 + self.m_down1 = [ + ConvTransBlock( + dim // 2, + dim // 2, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution, + ) + for i in range(config[0]) + ] + [nn.Conv2d(dim, 2 * dim, 2, 2, 0, bias=False)] + + begin += config[0] + self.m_down2 = [ + ConvTransBlock( + dim, + dim, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution // 2, + ) + for i in range(config[1]) + ] + [nn.Conv2d(2 * dim, 4 * dim, 2, 2, 0, bias=False)] + + begin += config[1] + self.m_down3 = [ + ConvTransBlock( + 2 * dim, + 2 * dim, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution // 4, + ) + for i in range(config[2]) + ] + [nn.Conv2d(4 * dim, 8 * dim, 2, 2, 0, bias=False)] + + begin += config[2] + self.m_body = [ + ConvTransBlock( + 4 * dim, + 4 * dim, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution // 8, + ) + for i in range(config[3]) + ] + + begin += config[3] + self.m_up3 = [ + nn.ConvTranspose2d(8 * dim, 4 * dim, 2, 2, 0, bias=False), + ] + [ + ConvTransBlock( + 2 * dim, + 2 * dim, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution // 4, + ) + for i in range(config[4]) + ] + + begin += config[4] + self.m_up2 = [ + nn.ConvTranspose2d(4 * dim, 2 * dim, 2, 2, 0, bias=False), + ] + [ + ConvTransBlock( + dim, + dim, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution // 2, + ) + for i in range(config[5]) + ] + + begin += config[5] + self.m_up1 = [ + nn.ConvTranspose2d(2 * dim, dim, 2, 2, 0, bias=False), + ] + [ + ConvTransBlock( + dim // 2, + dim // 2, + self.head_dim, + self.window_size, + dpr[i + begin], + "W" if not i % 2 else "SW", + input_resolution, + ) + 
for i in range(config[6]) + ] + + self.m_tail = [nn.Conv2d(dim, in_nc, 3, 1, 1, bias=False)] + + self.m_head = nn.Sequential(*self.m_head) + self.m_down1 = nn.Sequential(*self.m_down1) + self.m_down2 = nn.Sequential(*self.m_down2) + self.m_down3 = nn.Sequential(*self.m_down3) + self.m_body = nn.Sequential(*self.m_body) + self.m_up3 = nn.Sequential(*self.m_up3) + self.m_up2 = nn.Sequential(*self.m_up2) + self.m_up1 = nn.Sequential(*self.m_up1) + self.m_tail = nn.Sequential(*self.m_tail) + # self.apply(self._init_weights) + self.load_state_dict(state_dict, strict=True) + + def check_image_size(self, x): + _, _, h, w = x.size() + mod_pad_h = (64 - h % 64) % 64 + mod_pad_w = (64 - w % 64) % 64 + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect") + return x + + def forward(self, x0): + h, w = x0.size()[-2:] + x0 = self.check_image_size(x0) + + x1 = self.m_head(x0) + x2 = self.m_down1(x1) + x3 = self.m_down2(x2) + x4 = self.m_down3(x3) + x = self.m_body(x4) + x = self.m_up3(x + x4) + x = self.m_up2(x + x3) + x = self.m_up1(x + x2) + x = self.m_tail(x + x1) + + x = x[:, :, :h, :w] + return x + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) diff --git a/ldm_patched/pfn/architecture/SPSR.py b/ldm_patched/pfn/architecture/SPSR.py new file mode 100644 index 0000000000000000000000000000000000000000..c3cefff190292a63cf61fe3fa9c28131dac4f369 --- /dev/null +++ b/ldm_patched/pfn/architecture/SPSR.py @@ -0,0 +1,383 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from . 
import block as B + + +class Get_gradient_nopadding(nn.Module): + def __init__(self): + super(Get_gradient_nopadding, self).__init__() + kernel_v = [[0, -1, 0], [0, 0, 0], [0, 1, 0]] + kernel_h = [[0, 0, 0], [-1, 0, 1], [0, 0, 0]] + kernel_h = torch.FloatTensor(kernel_h).unsqueeze(0).unsqueeze(0) + kernel_v = torch.FloatTensor(kernel_v).unsqueeze(0).unsqueeze(0) + self.weight_h = nn.Parameter(data=kernel_h, requires_grad=False) # type: ignore + + self.weight_v = nn.Parameter(data=kernel_v, requires_grad=False) # type: ignore + + def forward(self, x): + x_list = [] + for i in range(x.shape[1]): + x_i = x[:, i] + x_i_v = F.conv2d(x_i.unsqueeze(1), self.weight_v, padding=1) + x_i_h = F.conv2d(x_i.unsqueeze(1), self.weight_h, padding=1) + x_i = torch.sqrt(torch.pow(x_i_v, 2) + torch.pow(x_i_h, 2) + 1e-6) + x_list.append(x_i) + + x = torch.cat(x_list, dim=1) + + return x + + +class SPSRNet(nn.Module): + def __init__( + self, + state_dict, + norm=None, + act: str = "leakyrelu", + upsampler: str = "upconv", + mode: B.ConvMode = "CNA", + ): + super(SPSRNet, self).__init__() + self.model_arch = "SPSR" + self.sub_type = "SR" + + self.state = state_dict + self.norm = norm + self.act = act + self.upsampler = upsampler + self.mode = mode + + self.num_blocks = self.get_num_blocks() + + self.in_nc: int = self.state["model.0.weight"].shape[1] + self.out_nc: int = self.state["f_HR_conv1.0.bias"].shape[0] + + self.scale = self.get_scale(4) + self.num_filters: int = self.state["model.0.weight"].shape[0] + + self.supports_fp16 = True + self.supports_bfp16 = True + self.min_size_restriction = None + + n_upscale = int(math.log(self.scale, 2)) + if self.scale == 3: + n_upscale = 1 + + fea_conv = B.conv_block( + self.in_nc, self.num_filters, kernel_size=3, norm_type=None, act_type=None + ) + rb_blocks = [ + B.RRDB( + self.num_filters, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + for _ in range(self.num_blocks) 
+ ] + LR_conv = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=norm, + act_type=None, + mode=mode, + ) + + if upsampler == "upconv": + upsample_block = B.upconv_block + elif upsampler == "pixelshuffle": + upsample_block = B.pixelshuffle_block + else: + raise NotImplementedError(f"upsample mode [{upsampler}] is not found") + if self.scale == 3: + a_upsampler = upsample_block( + self.num_filters, self.num_filters, 3, act_type=act + ) + else: + a_upsampler = [ + upsample_block(self.num_filters, self.num_filters, act_type=act) + for _ in range(n_upscale) + ] + self.HR_conv0_new = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=act, + ) + self.HR_conv1_new = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + + self.model = B.sequential( + fea_conv, + B.ShortcutBlockSPSR(B.sequential(*rb_blocks, LR_conv)), + *a_upsampler, + self.HR_conv0_new, + ) + + self.get_g_nopadding = Get_gradient_nopadding() + + self.b_fea_conv = B.conv_block( + self.in_nc, self.num_filters, kernel_size=3, norm_type=None, act_type=None + ) + + self.b_concat_1 = B.conv_block( + 2 * self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + self.b_block_1 = B.RRDB( + self.num_filters * 2, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + + self.b_concat_2 = B.conv_block( + 2 * self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + self.b_block_2 = B.RRDB( + self.num_filters * 2, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + + self.b_concat_3 = B.conv_block( + 2 * self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + self.b_block_3 = B.RRDB( + self.num_filters * 2, + kernel_size=3, + 
gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + + self.b_concat_4 = B.conv_block( + 2 * self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + self.b_block_4 = B.RRDB( + self.num_filters * 2, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + + self.b_LR_conv = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=norm, + act_type=None, + mode=mode, + ) + + if upsampler == "upconv": + upsample_block = B.upconv_block + elif upsampler == "pixelshuffle": + upsample_block = B.pixelshuffle_block + else: + raise NotImplementedError(f"upsample mode [{upsampler}] is not found") + if self.scale == 3: + b_upsampler = upsample_block( + self.num_filters, self.num_filters, 3, act_type=act + ) + else: + b_upsampler = [ + upsample_block(self.num_filters, self.num_filters, act_type=act) + for _ in range(n_upscale) + ] + + b_HR_conv0 = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=act, + ) + b_HR_conv1 = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + + self.b_module = B.sequential(*b_upsampler, b_HR_conv0, b_HR_conv1) + + self.conv_w = B.conv_block( + self.num_filters, self.out_nc, kernel_size=1, norm_type=None, act_type=None + ) + + self.f_concat = B.conv_block( + self.num_filters * 2, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=None, + ) + + self.f_block = B.RRDB( + self.num_filters * 2, + kernel_size=3, + gc=32, + stride=1, + bias=True, + pad_type="zero", + norm_type=norm, + act_type=act, + mode="CNA", + ) + + self.f_HR_conv0 = B.conv_block( + self.num_filters, + self.num_filters, + kernel_size=3, + norm_type=None, + act_type=act, + ) + self.f_HR_conv1 = B.conv_block( + self.num_filters, self.out_nc, kernel_size=3, norm_type=None, act_type=None 
+ ) + + self.load_state_dict(self.state, strict=False) + + def get_scale(self, min_part: int = 4) -> int: + n = 0 + for part in list(self.state): + parts = part.split(".") + if len(parts) == 3: + part_num = int(parts[1]) + if part_num > min_part and parts[0] == "model" and parts[2] == "weight": + n += 1 + return 2**n + + def get_num_blocks(self) -> int: + nb = 0 + for part in list(self.state): + parts = part.split(".") + n_parts = len(parts) + if n_parts == 5 and parts[2] == "sub": + nb = int(parts[3]) + return nb + + def forward(self, x): + x_grad = self.get_g_nopadding(x) + x = self.model[0](x) + + x, block_list = self.model[1](x) + + x_ori = x + for i in range(5): + x = block_list[i](x) + x_fea1 = x + + for i in range(5): + x = block_list[i + 5](x) + x_fea2 = x + + for i in range(5): + x = block_list[i + 10](x) + x_fea3 = x + + for i in range(5): + x = block_list[i + 15](x) + x_fea4 = x + + x = block_list[20:](x) + # short cut + x = x_ori + x + x = self.model[2:](x) + x = self.HR_conv1_new(x) + + x_b_fea = self.b_fea_conv(x_grad) + x_cat_1 = torch.cat([x_b_fea, x_fea1], dim=1) + + x_cat_1 = self.b_block_1(x_cat_1) + x_cat_1 = self.b_concat_1(x_cat_1) + + x_cat_2 = torch.cat([x_cat_1, x_fea2], dim=1) + + x_cat_2 = self.b_block_2(x_cat_2) + x_cat_2 = self.b_concat_2(x_cat_2) + + x_cat_3 = torch.cat([x_cat_2, x_fea3], dim=1) + + x_cat_3 = self.b_block_3(x_cat_3) + x_cat_3 = self.b_concat_3(x_cat_3) + + x_cat_4 = torch.cat([x_cat_3, x_fea4], dim=1) + + x_cat_4 = self.b_block_4(x_cat_4) + x_cat_4 = self.b_concat_4(x_cat_4) + + x_cat_4 = self.b_LR_conv(x_cat_4) + + # short cut + x_cat_4 = x_cat_4 + x_b_fea + x_branch = self.b_module(x_cat_4) + + # x_out_branch = self.conv_w(x_branch) + ######## + x_branch_d = x_branch + x_f_cat = torch.cat([x_branch_d, x], dim=1) + x_f_cat = self.f_block(x_f_cat) + x_out = self.f_concat(x_f_cat) + x_out = self.f_HR_conv0(x_out) + x_out = self.f_HR_conv1(x_out) + + ######### + # return x_out_branch, x_out, x_grad + return x_out diff 
# --git a/ldm_patched/pfn/architecture/SRVGG.py b/ldm_patched/pfn/architecture/SRVGG.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..7a8ec37ae5dc4effd0ba688cf4c3a51801e1f2c9
# --- /dev/null
# +++ b/ldm_patched/pfn/architecture/SRVGG.py
# @@ -0,0 +1,114 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import math

import torch.nn as nn
import torch.nn.functional as F


class SRVGGNetCompact(nn.Module):
    """A compact VGG-style network structure for super-resolution.

    It is a compact network structure, which performs upsampling in the last
    layer and no convolution is conducted on the HR feature space.

    All hyper-parameters (input channels, feature width, number of body
    convolutions and upscale factor) are read from the shapes and key names of
    the supplied checkpoint rather than passed in.

    Args:
        state_dict: Checkpoint mapping parameter names (``body.<idx>.weight``,
            ``body.<idx>.bias``) to tensors. If it contains a ``"params"``
            entry, that nested mapping is used instead.
        act_type (str): Activation type, options: 'relu', 'prelu',
            'leakyrelu'. Default: prelu.

    Raises:
        ValueError: If ``act_type`` is not one of the supported options.
    """

    def __init__(
        self,
        state_dict,
        act_type: str = "prelu",
    ):
        super(SRVGGNetCompact, self).__init__()
        self.model_arch = "SRVGG (RealESRGAN)"
        self.sub_type = "SR"

        self.act_type = act_type

        self.state = state_dict

        if "params" in self.state:
            self.state = self.state["params"]

        self.key_arr = list(self.state.keys())

        self.in_nc = self.get_in_nc()
        self.num_feat = self.get_num_feats()
        self.num_conv = self.get_num_conv()
        self.out_nc = self.in_nc  # :(
        self.pixelshuffle_shape = None  # Defined in get_scale()
        self.scale = self.get_scale()

        self.supports_fp16 = True
        self.supports_bfp16 = True
        self.min_size_restriction = None

        self.body = nn.ModuleList()
        # the first conv
        self.body.append(nn.Conv2d(self.in_nc, self.num_feat, 3, 1, 1))
        # the first activation
        self.body.append(self._make_activation())

        # the body structure: conv followed by its own activation instance
        for _ in range(self.num_conv):
            self.body.append(nn.Conv2d(self.num_feat, self.num_feat, 3, 1, 1))
            self.body.append(self._make_activation())

        # the last conv
        self.body.append(nn.Conv2d(self.num_feat, self.pixelshuffle_shape, 3, 1, 1))  # type: ignore
        # upsample
        self.upsampler = nn.PixelShuffle(self.scale)

        self.load_state_dict(self.state, strict=False)

    def _make_activation(self) -> nn.Module:
        """Build a fresh activation module for ``self.act_type``.

        Previously an unsupported ``act_type`` fell through every branch and
        surfaced as an ``UnboundLocalError``; it is now reported explicitly.
        """
        if self.act_type == "relu":
            return nn.ReLU(inplace=True)
        if self.act_type == "prelu":
            return nn.PReLU(num_parameters=self.num_feat)
        if self.act_type == "leakyrelu":
            return nn.LeakyReLU(negative_slope=0.1, inplace=True)
        raise ValueError(
            f"Unsupported act_type {self.act_type!r}; "
            "expected 'relu', 'prelu' or 'leakyrelu'"
        )

    def get_num_conv(self) -> int:
        """Return the number of body convolutions.

        Derived from the layer index embedded in the last checkpoint key.
        """
        return (int(self.key_arr[-1].split(".")[1]) - 2) // 2

    def get_num_feats(self) -> int:
        """Return dimension 0 of the first checkpoint tensor (feature width)."""
        return self.state[self.key_arr[0]].shape[0]

    def get_in_nc(self) -> int:
        """Return dimension 1 of the first checkpoint tensor (input channels)."""
        return self.state[self.key_arr[0]].shape[1]

    def get_scale(self) -> int:
        """Return the upscale factor inferred from the last checkpoint tensor.

        Side effects: sets ``self.pixelshuffle_shape`` (dimension 0 of the last
        tensor) and ``self.out_nc`` (assumed equal to ``self.in_nc``).
        """
        self.pixelshuffle_shape = self.state[self.key_arr[-1]].shape[0]
        # Assume out_nc is the same as in_nc
        # I cant think of a better way to do that
        self.out_nc = self.in_nc
        scale = math.sqrt(self.pixelshuffle_shape / self.out_nc)
        if scale - int(scale) > 0:
            print(
                "out_nc is probably different than in_nc, scale calculation might be wrong"
            )
        scale = int(scale)
        return scale

    def forward(self, x):
        """Run the body, pixel-shuffle, and add the nearest-upsampled input."""
        out = x
        for layer in self.body:
            out = layer(out)

        out = self.upsampler(out)
        # add the nearest upsampled image, so that the network learns the residual
        base = F.interpolate(x, scale_factor=self.scale, mode="nearest")
        out += base
        return out


# diff --git a/ldm_patched/pfn/architecture/SwiftSRGAN.py b/ldm_patched/pfn/architecture/SwiftSRGAN.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..dbb7725b08dc2462661b7ba45db605a06fadacb9
# --- /dev/null
# +++ b/ldm_patched/pfn/architecture/SwiftSRGAN.py
# @@ -0,0 +1,161 @@
# From https://github.com/Koushik0901/Swift-SRGAN/blob/master/swift-srgan/models.py

import torch
from torch import nn


class SeperableConv2d(nn.Module):
    """Depthwise convolution followed by a 1x1 pointwise convolution."""

    def __init__(
        self, in_channels, out_channels, kernel_size, stride=1, padding=1, bias=True
    ):
        super(SeperableConv2d, self).__init__()
        # groups=in_channels makes this a per-channel (depthwise) convolution
        self.depthwise = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            groups=in_channels,
            bias=bias,
            padding=padding,
        )
        self.pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=bias)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))


class ConvBlock(nn.Module):
    """Separable conv, optional batch norm, optional activation.

    Args:
        in_channels: Input channel count.
        out_channels: Output channel count.
        use_act: Apply the activation in ``forward`` when True.
        use_bn: Use ``BatchNorm2d`` (and drop the conv biases) when True,
            otherwise ``Identity``.
        discriminator: Select ``LeakyReLU(0.2)`` instead of ``PReLU``.
        **kwargs: Forwarded to ``SeperableConv2d`` (kernel_size, stride, padding).
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        use_act=True,
        use_bn=True,
        discriminator=False,
        **kwargs,
    ):
        super(ConvBlock, self).__init__()

        self.use_act = use_act
        self.cnn = SeperableConv2d(in_channels, out_channels, **kwargs, bias=not use_bn)
        self.bn = nn.BatchNorm2d(out_channels) if use_bn else nn.Identity()
        # The activation module is always created (so its parameters exist in
        # the state dict) even when use_act is False.
        self.act = (
            nn.LeakyReLU(0.2, inplace=True)
            if discriminator
            else nn.PReLU(num_parameters=out_channels)
        )

    def forward(self, x):
        normed = self.bn(self.cnn(x))
        return self.act(normed) if self.use_act else normed


class UpsampleBlock(nn.Module):
    """Separable conv to ``in_channels * scale_factor**2``, pixel shuffle, PReLU."""

    def __init__(self, in_channels, scale_factor):
        super(UpsampleBlock, self).__init__()

        self.conv = SeperableConv2d(
            in_channels,
            in_channels * scale_factor**2,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        self.ps = nn.PixelShuffle(
            scale_factor
        )  # (in_channels * 4, H, W) -> (in_channels, H*2, W*2)
        self.act = nn.PReLU(num_parameters=in_channels)

    def forward(self, x):
        return self.act(self.ps(self.conv(x)))


class ResidualBlock(nn.Module):
    """Two ``ConvBlock`` layers (second without activation) plus a skip connection."""

    def __init__(self, in_channels):
        super(ResidualBlock, self).__init__()

        self.block1 = ConvBlock(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )
        self.block2 = ConvBlock(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1, use_act=False
        )

    def forward(self, x):
        out = self.block1(x)
        out = self.block2(out)
        return out + x


class Generator(nn.Module):
    """Swift-SRGAN Generator.

    The architecture is inferred from the checkpoint: channel counts come from
    tensor shapes, and the number of residual / upsampling stages from the
    distinct indices found under the ``residual.`` and ``upsampler.`` keys.

    Args:
        state_dict: Checkpoint mapping parameter names to tensors. If it
            contains a ``"model"`` entry, that nested mapping is used instead.

    Returns:
        torch.Tensor: super resolution image (from ``forward``), mapped into
        [0, 1] via ``(tanh(x) + 1) / 2``.
    """

    def __init__(
        self,
        state_dict,
    ):
        super(Generator, self).__init__()
        self.model_arch = "Swift-SRGAN"
        self.sub_type = "SR"
        self.state = state_dict
        if "model" in self.state:
            self.state = self.state["model"]

        self.in_nc: int = self.state["initial.cnn.depthwise.weight"].shape[0]
        self.out_nc: int = self.state["final_conv.pointwise.weight"].shape[0]
        self.num_filters: int = self.state["initial.cnn.pointwise.weight"].shape[0]
        self.num_blocks = len(
            {x.split(".")[1] for x in self.state.keys() if "residual" in x}
        )
        # Each upsampler stage doubles the resolution, so the number of stages
        # is log2(scale).
        num_upsamplers = len(
            {x.split(".")[1] for x in self.state.keys() if "upsampler" in x}
        )
        self.scale: int = 2**num_upsamplers

        in_channels = self.in_nc
        num_channels = self.num_filters
        num_blocks = self.num_blocks

        self.supports_fp16 = True
        self.supports_bfp16 = True
        self.min_size_restriction = None

        self.initial = ConvBlock(
            in_channels, num_channels, kernel_size=9, stride=1, padding=4, use_bn=False
        )
        self.residual = nn.Sequential(
            *[ResidualBlock(num_channels) for _ in range(num_blocks)]
        )
        self.convblock = ConvBlock(
            num_channels,
            num_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            use_act=False,
        )
        # Build exactly as many x2 stages as the checkpoint contains.  The
        # former `range(scale // 2)` only matched for 2x and 4x; an 8x
        # checkpoint got four stages (16x output, one stage left untrained).
        self.upsampler = nn.Sequential(
            *[
                UpsampleBlock(num_channels, scale_factor=2)
                for _ in range(num_upsamplers)
            ]
        )
        # Output width follows the checkpoint's final layer rather than
        # assuming it equals the input channel count.
        self.final_conv = SeperableConv2d(
            num_channels, self.out_nc, kernel_size=9, stride=1, padding=4
        )

        self.load_state_dict(self.state, strict=False)

    def forward(self, x):
        initial = self.initial(x)
        x = self.residual(initial)
        x = self.convblock(x) + initial
        x = self.upsampler(x)
        return (torch.tanh(self.final_conv(x)) + 1) / 2


# diff --git a/ldm_patched/pfn/architecture/Swin2SR.py b/ldm_patched/pfn/architecture/Swin2SR.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..cb57ecfc4ada45a6b087247017732437b1af0fcc
# --- /dev/null
# +++
class Mlp(nn.Module):
    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): Size of the last input dimension.
        hidden_features (int, optional): Hidden width; ``in_features`` when falsy.
        out_features (int, optional): Output width; ``in_features`` when falsy.
        act_layer: Zero-argument callable that builds the activation module.
        drop (float): Dropout probability used after both linear layers.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    """Inverse of :func:`window_partition`.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    # Integer arithmetic keeps the batch size exact (no float round trip).
    windows_per_image = (H // window_size) * (W // window_size)
    B = windows.shape[0] // windows_per_image
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r"""Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Attention logits are cosine similarities scaled by a learned, clamped
    per-head ``logit_scale``; the relative position bias is produced by a small
    MLP (``cpb_mlp``) evaluated on a log-spaced table of relative coordinates.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        pretrained_window_size (tuple[int]): The height and width of the window in pre-training.
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        attn_drop=0.0,
        proj_drop=0.0,
        pretrained_window_size=(0, 0),  # immutable default (was a shared list)
    ):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.pretrained_window_size = pretrained_window_size
        self.num_heads = num_heads

        self.logit_scale = nn.Parameter(
            torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
        )

        # mlp to generate continuous relative position bias
        self.cpb_mlp = nn.Sequential(
            nn.Linear(2, 512, bias=True),
            nn.ReLU(inplace=True),
            nn.Linear(512, num_heads, bias=False),
        )

        # get relative_coords_table
        relative_coords_h = torch.arange(
            -(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32
        )
        relative_coords_w = torch.arange(
            -(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32
        )
        # indexing="ij" is the historical default; passing it explicitly keeps
        # the layout and avoids the deprecation warning.
        relative_coords_table = (
            torch.stack(
                torch.meshgrid([relative_coords_h, relative_coords_w], indexing="ij")
            )
            .permute(1, 2, 0)
            .contiguous()
            .unsqueeze(0)
        )  # 1, 2*Wh-1, 2*Ww-1, 2
        if pretrained_window_size[0] > 0:
            relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1
            relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1
        else:
            relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
            relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
        relative_coords_table *= 8  # normalize to -8, 8
        relative_coords_table = (
            torch.sign(relative_coords_table)
            * torch.log2(torch.abs(relative_coords_table) + 1.0)
            / math.log2(8)
        )

        self.register_buffer("relative_coords_table", relative_coords_table)

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(
            torch.meshgrid([coords_h, coords_w], indexing="ij")
        )  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(dim))
            self.v_bias = nn.Parameter(torch.zeros(dim))
        else:
            self.q_bias = None
            self.v_bias = None
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: additive mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            # keys carry no bias: the middle third of the fused bias is zero
            qkv_bias = torch.cat(
                (
                    self.q_bias,
                    torch.zeros_like(self.v_bias, requires_grad=False),
                    self.v_bias,
                )
            )
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        # cosine attention
        attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
        # Scalar bound: same limit (log(1 / 0.01)) without building a tensor
        # and moving it to the device on every call.
        logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp()
        attn = attn * logit_scale

        relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(
            -1, self.num_heads
        )
        relative_position_bias = relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return (
            f"dim={self.dim}, window_size={self.window_size}, "
            f"pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}"
        )

    def flops(self, N):
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops


class SwinTransformerBlock(nn.Module):
    r"""Swin Transformer Block (post-norm: LayerNorm is applied to each branch output).

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        pretrained_window_size (int): Window size in pre-training.
    """

    def __init__(
        self,
        dim,
        input_resolution,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        pretrained_window_size=0,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert (
            0 <= self.shift_size < self.window_size
        ), "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            pretrained_window_size=to_2tuple(pretrained_window_size),
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )

        if self.shift_size > 0:
            attn_mask = self.calculate_mask(self.input_resolution)
        else:
            attn_mask = None

        self.register_buffer("attn_mask", attn_mask)

    def calculate_mask(self, x_size):
        """Build the additive SW-MSA mask for a feature map of size ``x_size``.

        Returns a (num_windows, window_size**2, window_size**2) tensor holding
        0.0 where two tokens of a window come from the same pre-shift region
        and -100.0 otherwise.
        """
        H, W = x_size
        img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
        h_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        w_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        # label each of the 3x3 regions with its own id
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size
        )  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
            attn_mask == 0, float(0.0)
        )

        return attn_mask

    def forward(self, x, x_size):
        H, W = x_size
        B, L, C = x.shape
        # assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = x.view(B, H, W, C)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(
                x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
            )
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(
            shifted_x, self.window_size
        )  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(
            -1, self.window_size * self.window_size, C
        )  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are
        # the multiple of window size)
        if self.shift_size == 0:
            # Non-shifted blocks never need a mask; rebuilding one for a new
            # input size would only yield zeros.
            attn_mask = None
        elif self.input_resolution == x_size:
            attn_mask = self.attn_mask
        else:
            # Match the activations' dtype as well as device so the addition
            # does not promote attention to float32 for half/bfloat16 inputs.
            attn_mask = self.calculate_mask(x_size).to(device=x.device, dtype=x.dtype)
        attn_windows = self.attn(
            x_windows, mask=attn_mask
        )  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = torch.roll(
                shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
            )
        else:
            x = shifted_x
        x = x.view(B, H * W, C)
        x = shortcut + self.drop_path(self.norm1(x))

        # FFN
        x = x + self.drop_path(self.norm2(self.mlp(x)))

        return x

    def extra_repr(self) -> str:
        return (
            f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
            f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
        )

    def flops(self):
        flops = 0
        H, W = self.input_resolution
        # norm1
        flops += self.dim * H * W
        # W-MSA/SW-MSA
        nW = H * W / self.window_size / self.window_size
        flops += nW * self.attn.flops(self.window_size * self.window_size)
        # mlp
        flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
        # norm2
        flops += self.dim * H * W
        return flops


class PatchMerging(nn.Module):
    r"""Patch Merging Layer: concatenates each 2x2 neighbourhood and projects 4*C -> 2*C.

    Args:
        input_resolution (tuple[int]): Resolution of input feature.
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    """

    def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(2 * dim)

    def forward(self, x):
        """
        x: B, H*W, C
        """
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"
        assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even."

        x = x.view(B, H, W, C)

        x0 = x[:, 0::2, 0::2, :]  # B H/2 W/2 C
        x1 = x[:, 1::2, 0::2, :]  # B H/2 W/2 C
        x2 = x[:, 0::2, 1::2, :]  # B H/2 W/2 C
        x3 = x[:, 1::2, 1::2, :]  # B H/2 W/2 C
        x = torch.cat([x0, x1, x2, x3], -1)  # B H/2 W/2 4*C
        x = x.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        x = self.reduction(x)
        x = self.norm(x)

        return x

    def extra_repr(self) -> str:
        return f"input_resolution={self.input_resolution}, dim={self.dim}"

    def flops(self):
        H, W = self.input_resolution
        flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
        flops += H * W * self.dim // 2
        return flops


class BasicLayer(nn.Module):
    """A basic Swin Transformer layer for one stage.

    Blocks alternate between non-shifted (even index) and shifted windows
    (odd index, shift of ``window_size // 2``).

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        pretrained_window_size (int): Local window size in pre-training.
    """

    def __init__(
        self,
        dim,
        input_resolution,
        depth,
        num_heads,
        window_size,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        downsample=None,
        use_checkpoint=False,
        pretrained_window_size=0,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks
        self.blocks = nn.ModuleList(
            [
                SwinTransformerBlock(
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    window_size=window_size,
                    shift_size=0 if (i % 2 == 0) else window_size // 2,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=drop_path[i]
                    if isinstance(drop_path, list)
                    else drop_path,
                    norm_layer=norm_layer,
                    pretrained_window_size=pretrained_window_size,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(
                input_resolution, dim=dim, norm_layer=norm_layer
            )
        else:
            self.downsample = None

    def forward(self, x, x_size):
        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, x_size)
            else:
                x = blk(x, x_size)
        if self.downsample is not None:
            x = self.downsample(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"

    def flops(self):
        flops = 0
        for blk in self.blocks:
            flops += blk.flops()
        if self.downsample is not None:
            flops += self.downsample.flops()
        return flops

    def _init_respostnorm(self):
        # zero every block's norm weights and biases
        for blk in self.blocks:
            nn.init.constant_(blk.norm1.bias, 0)
            nn.init.constant_(blk.norm1.weight, 0)
            nn.init.constant_(blk.norm2.bias, 0)
            nn.init.constant_(blk.norm2.weight, 0)


class PatchEmbed(nn.Module):
    r"""Image to Patch Embedding

    Args:
        img_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(
        self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [
            img_size[0] // patch_size[0],
            img_size[1] // patch_size[1],
        ]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
        )
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        B, C, H, W = x.shape  # also rejects inputs that are not 4-D
        # FIXME look at relaxing size constraints
        # assert H == self.img_size[0] and W == self.img_size[1],
        #      f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x).flatten(2).transpose(1, 2)  # B Ph*Pw C
        if self.norm is not None:
            x = self.norm(x)
        return x

    def flops(self):
        Ho, Wo = self.patches_resolution
        flops = (
            Ho
            * Wo
            * self.embed_dim
            * self.in_chans
            * (self.patch_size[0] * self.patch_size[1])
        )
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops


class RSTB(nn.Module):
    """Residual Swin Transformer Block (RSTB).

    Runs a :class:`BasicLayer`, maps the tokens back to an image, applies a
    convolutional block, re-embeds the result and adds the block input.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
        img_size: Input image size.
        patch_size: Patch size.
        resi_connection: The convolutional block before residual connection. '1conv'/'3conv'

    Raises:
        ValueError: If ``resi_connection`` is neither ``"1conv"`` nor ``"3conv"``.
    """

    def __init__(
        self,
        dim,
        input_resolution,
        depth,
        num_heads,
        window_size,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        norm_layer=nn.LayerNorm,
        downsample=None,
        use_checkpoint=False,
        img_size=224,
        patch_size=4,
        resi_connection="1conv",
    ):
        super(RSTB, self).__init__()

        # Fail early: without one of these the module would have no `conv`
        # and only break later, inside forward().
        if resi_connection not in ("1conv", "3conv"):
            raise ValueError(
                f"resi_connection {resi_connection!r} is not supported. "
                "Supported values: '1conv' and '3conv'."
            )

        self.dim = dim
        self.input_resolution = input_resolution

        self.residual_group = BasicLayer(
            dim=dim,
            input_resolution=input_resolution,
            depth=depth,
            num_heads=num_heads,
            window_size=window_size,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            drop=drop,
            attn_drop=attn_drop,
            drop_path=drop_path,
            norm_layer=norm_layer,
            downsample=downsample,
            use_checkpoint=use_checkpoint,
        )

        if resi_connection == "1conv":
            self.conv = nn.Conv2d(dim, dim, 3, 1, 1)
        else:  # "3conv"
            # to save parameters and memory
            self.conv = nn.Sequential(
                nn.Conv2d(dim, dim // 4, 3, 1, 1),
                nn.LeakyReLU(negative_slope=0.2, inplace=True),
                nn.Conv2d(dim // 4, dim // 4, 1, 1, 0),
                nn.LeakyReLU(negative_slope=0.2, inplace=True),
                nn.Conv2d(dim // 4, dim, 3, 1, 1),
            )

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=dim,
            embed_dim=dim,
            norm_layer=None,
        )

        self.patch_unembed = PatchUnEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=dim,
            embed_dim=dim,
            norm_layer=None,
        )

    def forward(self, x, x_size):
        return (
            self.patch_embed(
                self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size))
            )
            + x
        )

    def flops(self):
        flops = 0
        flops += self.residual_group.flops()
        H, W = self.input_resolution
        flops += H * W * self.dim * self.dim * 9
        flops += self.patch_embed.flops()
        flops += self.patch_unembed.flops()

        return flops


class PatchUnEmbed(nn.Module):
    r"""Patch tokens to image: reshapes (B, H*W, C) tokens into a (B, C, H, W) map.

    Args:
        img_size (int): Image size.  Default: 224.
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Accepted for signature symmetry with
            PatchEmbed; not used by this module.
    """

    def __init__(
        self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        patches_resolution = [
            img_size[0] // patch_size[0],
            img_size[1] // patch_size[1],
        ]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

    def forward(self, x, x_size):
        B, HW, C = x.shape
        x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1])  # B C H W
        return x

    def flops(self):
        flops = 0
        return flops


def _pixel_shuffle_stages(scale, num_feat):
    """Return the conv + PixelShuffle layers that upsample by ``scale`` (2^n or 3)."""
    if scale > 0 and (scale & (scale - 1)) == 0:  # scale = 2^n
        stages = []
        # exact integer log2 (no float logarithm)
        for _ in range(int(scale).bit_length() - 1):
            stages.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1))
            stages.append(nn.PixelShuffle(2))
        return stages
    if scale == 3:
        return [nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1), nn.PixelShuffle(3)]
    raise ValueError(
        f"scale {scale} is not supported. " "Supported scales: 2^n and 3."
    )


class Upsample(nn.Sequential):
    """Upsample module.

    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.

    Raises:
        ValueError: If ``scale`` is not a positive power of two or 3.
    """

    def __init__(self, scale, num_feat):
        super(Upsample, self).__init__(*_pixel_shuffle_stages(scale, num_feat))


class Upsample_hf(nn.Sequential):
    """Upsample module.

    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.

    Raises:
        ValueError: If ``scale`` is not a positive power of two or 3.
    """

    def __init__(self, scale, num_feat):
        super(Upsample_hf, self).__init__(*_pixel_shuffle_stages(scale, num_feat))


class UpsampleOneStep(nn.Sequential):
    """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle)
    Used in lightweight SR to save parameters.

    Args:
        scale (int): Scale factor. Supported scales: 2^n and 3.
        num_feat (int): Channel number of intermediate features.
        num_out_ch (int): Number of output channels after the pixel shuffle.
        input_resolution (tuple[int], optional): (H, W) used only by ``flops``.
    """

    def __init__(self, scale, num_feat, num_out_ch, input_resolution=None):
        self.num_feat = num_feat
        self.input_resolution = input_resolution
        m = []
        m.append(nn.Conv2d(num_feat, (scale**2) * num_out_ch, 3, 1, 1))
        m.append(nn.PixelShuffle(scale))
        super(UpsampleOneStep, self).__init__(*m)

    def flops(self):
        H, W = self.input_resolution
        flops = H * W * self.num_feat * 3 * 9
        return flops
Default: True + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction + img_range: Image range. 1. or 255. + upsampler: The reconstruction reconstruction module. 'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None + resi_connection: The convolutional block before residual connection. '1conv'/'3conv' + """ + + def __init__( + self, + state_dict, + **kwargs, + ): + super(Swin2SR, self).__init__() + + # Defaults + img_size = 128 + patch_size = 1 + in_chans = 3 + embed_dim = 96 + depths = [6, 6, 6, 6] + num_heads = [6, 6, 6, 6] + window_size = 7 + mlp_ratio = 4.0 + qkv_bias = True + drop_rate = 0.0 + attn_drop_rate = 0.0 + drop_path_rate = 0.1 + norm_layer = nn.LayerNorm + ape = False + patch_norm = True + use_checkpoint = False + upscale = 2 + img_range = 1.0 + upsampler = "" + resi_connection = "1conv" + num_in_ch = in_chans + num_out_ch = in_chans + num_feat = 64 + + self.model_arch = "Swin2SR" + self.sub_type = "SR" + self.state = state_dict + if "params_ema" in self.state: + self.state = self.state["params_ema"] + elif "params" in self.state: + self.state = self.state["params"] + + state_keys = self.state.keys() + + if "conv_before_upsample.0.weight" in state_keys: + if "conv_aux.weight" in state_keys: + upsampler = "pixelshuffle_aux" + elif "conv_up1.weight" in state_keys: + upsampler = "nearest+conv" + else: + upsampler = "pixelshuffle" + supports_fp16 = False + elif "upsample.0.weight" in state_keys: + upsampler = 
"pixelshuffledirect" + else: + upsampler = "" + + num_feat = ( + self.state.get("conv_before_upsample.0.weight", None).shape[1] + if self.state.get("conv_before_upsample.weight", None) + else 64 + ) + + num_in_ch = self.state["conv_first.weight"].shape[1] + in_chans = num_in_ch + if "conv_last.weight" in state_keys: + num_out_ch = self.state["conv_last.weight"].shape[0] + else: + num_out_ch = num_in_ch + + upscale = 1 + if upsampler == "nearest+conv": + upsample_keys = [ + x for x in state_keys if "conv_up" in x and "bias" not in x + ] + + for upsample_key in upsample_keys: + upscale *= 2 + elif upsampler == "pixelshuffle" or upsampler == "pixelshuffle_aux": + upsample_keys = [ + x + for x in state_keys + if "upsample" in x and "conv" not in x and "bias" not in x + ] + for upsample_key in upsample_keys: + shape = self.state[upsample_key].shape[0] + upscale *= math.sqrt(shape // num_feat) + upscale = int(upscale) + elif upsampler == "pixelshuffledirect": + upscale = int( + math.sqrt(self.state["upsample.0.bias"].shape[0] // num_out_ch) + ) + + max_layer_num = 0 + max_block_num = 0 + for key in state_keys: + result = re.match( + r"layers.(\d*).residual_group.blocks.(\d*).norm1.weight", key + ) + if result: + layer_num, block_num = result.groups() + max_layer_num = max(max_layer_num, int(layer_num)) + max_block_num = max(max_block_num, int(block_num)) + + depths = [max_block_num + 1 for _ in range(max_layer_num + 1)] + + if ( + "layers.0.residual_group.blocks.0.attn.relative_position_bias_table" + in state_keys + ): + num_heads_num = self.state[ + "layers.0.residual_group.blocks.0.attn.relative_position_bias_table" + ].shape[-1] + num_heads = [num_heads_num for _ in range(max_layer_num + 1)] + else: + num_heads = depths + + embed_dim = self.state["conv_first.weight"].shape[0] + + mlp_ratio = float( + self.state["layers.0.residual_group.blocks.0.mlp.fc1.bias"].shape[0] + / embed_dim + ) + + # TODO: could actually count the layers, but this should do + if 
"layers.0.conv.4.weight" in state_keys: + resi_connection = "3conv" + else: + resi_connection = "1conv" + + window_size = int( + math.sqrt( + self.state[ + "layers.0.residual_group.blocks.0.attn.relative_position_index" + ].shape[0] + ) + ) + + if "layers.0.residual_group.blocks.1.attn_mask" in state_keys: + img_size = int( + math.sqrt( + self.state["layers.0.residual_group.blocks.1.attn_mask"].shape[0] + ) + * window_size + ) + + # The JPEG models are the only ones with window-size 7, and they also use this range + img_range = 255.0 if window_size == 7 else 1.0 + + self.in_nc = num_in_ch + self.out_nc = num_out_ch + self.num_feat = num_feat + self.embed_dim = embed_dim + self.num_heads = num_heads + self.depths = depths + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.scale = upscale + self.upsampler = upsampler + self.img_size = img_size + self.img_range = img_range + self.resi_connection = resi_connection + + self.supports_fp16 = False # Too much weirdness to support this at the moment + self.supports_bfp16 = True + self.min_size_restriction = 16 + + ## END AUTO DETECTION + + if in_chans == 3: + rgb_mean = (0.4488, 0.4371, 0.4040) + self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1) + else: + self.mean = torch.zeros(1, 1, 1, 1) + self.upscale = upscale + self.upsampler = upsampler + self.window_size = window_size + + ##################################################################################################### + ################################### 1, shallow feature extraction ################################### + self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1) + + ##################################################################################################### + ################################### 2, deep feature extraction ###################################### + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = embed_dim + self.mlp_ratio = 
mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # merge non-overlapping patches into image + self.patch_unembed = PatchUnEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) # type: ignore + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build Residual Swin Transformer blocks (RSTB) + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = RSTB( + dim=embed_dim, + input_resolution=(patches_resolution[0], patches_resolution[1]), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], # type: ignore # no impact on SR results + norm_layer=norm_layer, + downsample=None, + use_checkpoint=use_checkpoint, + img_size=img_size, + patch_size=patch_size, + resi_connection=resi_connection, + ) + self.layers.append(layer) + + if self.upsampler == "pixelshuffle_hf": + self.layers_hf = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = RSTB( + dim=embed_dim, + input_resolution=(patches_resolution[0], patches_resolution[1]), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + 
window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], # type: ignore # no impact on SR results # type: ignore + norm_layer=norm_layer, + downsample=None, + use_checkpoint=use_checkpoint, + img_size=img_size, + patch_size=patch_size, + resi_connection=resi_connection, + ) + self.layers_hf.append(layer) + + self.norm = norm_layer(self.num_features) + + # build the last conv layer in deep feature extraction + if resi_connection == "1conv": + self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1) + elif resi_connection == "3conv": + # to save parameters and memory + self.conv_after_body = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1), + ) + + ##################################################################################################### + ################################ 3, high quality image reconstruction ################################ + if self.upsampler == "pixelshuffle": + # for classical SR + self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.upsample = Upsample(upscale, num_feat) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + elif self.upsampler == "pixelshuffle_aux": + self.conv_bicubic = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1) + self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.conv_aux = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + self.conv_after_aux = nn.Sequential( + nn.Conv2d(3, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.upsample = Upsample(upscale, num_feat) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) 
+ + elif self.upsampler == "pixelshuffle_hf": + self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.upsample = Upsample(upscale, num_feat) + self.upsample_hf = Upsample_hf(upscale, num_feat) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + self.conv_first_hf = nn.Sequential( + nn.Conv2d(num_feat, embed_dim, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.conv_after_body_hf = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1) + self.conv_before_upsample_hf = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.conv_last_hf = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + + elif self.upsampler == "pixelshuffledirect": + # for lightweight SR (to save parameters) + self.upsample = UpsampleOneStep( + upscale, + embed_dim, + num_out_ch, + (patches_resolution[0], patches_resolution[1]), + ) + elif self.upsampler == "nearest+conv": + # for real-world SR (less artifacts) + assert self.upscale == 4, "only support x4 now." 
+ self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) + else: + # for image denoising and JPEG compression artifact reduction + self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1) + + self.apply(self._init_weights) + + self.load_state_dict(state_dict) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore # type: ignore + def no_weight_decay(self): + return {"absolute_pos_embed"} + + @torch.jit.ignore # type: ignore + def no_weight_decay_keywords(self): + return {"relative_position_bias_table"} + + def check_image_size(self, x): + _, _, h, w = x.size() + mod_pad_h = (self.window_size - h % self.window_size) % self.window_size + mod_pad_w = (self.window_size - w % self.window_size) % self.window_size + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect") + return x + + def forward_features(self, x): + x_size = (x.shape[2], x.shape[3]) + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x, x_size) + + x = self.norm(x) # B L C + x = self.patch_unembed(x, x_size) + + return x + + def forward_features_hf(self, x): + x_size = (x.shape[2], x.shape[3]) + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers_hf: + x = layer(x, x_size) + + x = self.norm(x) # B L C + x = self.patch_unembed(x, x_size) + + return x 
+ + def forward(self, x): + H, W = x.shape[2:] + x = self.check_image_size(x) + + self.mean = self.mean.type_as(x) + x = (x - self.mean) * self.img_range + + if self.upsampler == "pixelshuffle": + # for classical SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.conv_before_upsample(x) + x = self.conv_last(self.upsample(x)) + elif self.upsampler == "pixelshuffle_aux": + bicubic = F.interpolate( + x, + size=(H * self.upscale, W * self.upscale), + mode="bicubic", + align_corners=False, + ) + bicubic = self.conv_bicubic(bicubic) + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.conv_before_upsample(x) + aux = self.conv_aux(x) # b, 3, LR_H, LR_W + x = self.conv_after_aux(aux) + x = ( + self.upsample(x)[:, :, : H * self.upscale, : W * self.upscale] + + bicubic[:, :, : H * self.upscale, : W * self.upscale] + ) + x = self.conv_last(x) + aux = aux / self.img_range + self.mean + elif self.upsampler == "pixelshuffle_hf": + # for classical SR with HF + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x_before = self.conv_before_upsample(x) + x_out = self.conv_last(self.upsample(x_before)) + + x_hf = self.conv_first_hf(x_before) + x_hf = self.conv_after_body_hf(self.forward_features_hf(x_hf)) + x_hf + x_hf = self.conv_before_upsample_hf(x_hf) + x_hf = self.conv_last_hf(self.upsample_hf(x_hf)) + x = x_out + x_hf + x_hf = x_hf / self.img_range + self.mean + + elif self.upsampler == "pixelshuffledirect": + # for lightweight SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.upsample(x) + elif self.upsampler == "nearest+conv": + # for real-world SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.conv_before_upsample(x) + x = self.lrelu( + self.conv_up1( + torch.nn.functional.interpolate(x, scale_factor=2, mode="nearest") + ) + ) + x = self.lrelu( + self.conv_up2( + 
torch.nn.functional.interpolate(x, scale_factor=2, mode="nearest") + ) + ) + x = self.conv_last(self.lrelu(self.conv_hr(x))) + else: + # for image denoising and JPEG compression artifact reduction + x_first = self.conv_first(x) + res = self.conv_after_body(self.forward_features(x_first)) + x_first + x = x + self.conv_last(res) + + x = x / self.img_range + self.mean + if self.upsampler == "pixelshuffle_aux": + # NOTE: I removed an "aux" output here. not sure what that was for + return x[:, :, : H * self.upscale, : W * self.upscale] # type: ignore + + elif self.upsampler == "pixelshuffle_hf": + x_out = x_out / self.img_range + self.mean # type: ignore + return x_out[:, :, : H * self.upscale, : W * self.upscale], x[:, :, : H * self.upscale, : W * self.upscale], x_hf[:, :, : H * self.upscale, : W * self.upscale] # type: ignore + + else: + return x[:, :, : H * self.upscale, : W * self.upscale] + + def flops(self): + flops = 0 + H, W = self.patches_resolution + flops += H * W * 3 * self.embed_dim * 9 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() # type: ignore + flops += H * W * 3 * self.embed_dim * self.embed_dim + flops += self.upsample.flops() # type: ignore + return flops diff --git a/ldm_patched/pfn/architecture/SwinIR.py b/ldm_patched/pfn/architecture/SwinIR.py new file mode 100644 index 0000000000000000000000000000000000000000..439dcbcb2b12f7ff27a01490f4c2ae7b6e4eab9e --- /dev/null +++ b/ldm_patched/pfn/architecture/SwinIR.py @@ -0,0 +1,1224 @@ +# pylint: skip-file +# ----------------------------------------------------------------------------------- +# SwinIR: Image Restoration Using Swin Transformer, https://arxiv.org/abs/2108.10257 +# Originally Written by Ze Liu, Modified by Jingyun Liang. 
# -----------------------------------------------------------------------------------

import math
import re

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint

# Originally from the timm package
from .timm.drop import DropPath
from .timm.helpers import to_2tuple
from .timm.weight_init import trunc_normal_


class Mlp(nn.Module):
    """Two-layer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): Size of the last dimension of the input.
        hidden_features (int | None): Width of the hidden layer. Falls back to
            ``in_features`` when falsy. Default: None
        out_features (int | None): Size of the last dimension of the output.
            Falls back to ``in_features`` when falsy. Default: None
        act_layer: Zero-argument callable that builds the activation module.
            Default: nn.GELU
        drop (float): Dropout probability; the same dropout module is applied
            after the activation and after the second linear layer. Default: 0.0
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply fc1, activation, dropout, fc2 and dropout to ``x`` in that order."""
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    ``H`` and ``W`` must be multiples of ``window_size``; otherwise the
    ``view`` below cannot reshape the tensor.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    # Expose the window grid as separate axes: (B, H/ws, ws, W/ws, ws, C)
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    # Bring the two grid axes together, then fold them into the batch axis
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    """Reassemble windows produced by ``window_partition`` into a feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    # Batch size = total number of windows / windows per image
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    # Undo the permutation applied in window_partition
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r"""Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(  # type: ignore
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        # "ij" is the layout the implicit default produced; stating it explicitly
        # avoids the deprecation warning for calling meshgrid without `indexing`.
        coords = torch.stack(
            torch.meshgrid([coords_h, coords_w], indexing="ij")
        )  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # Scale the row offset so that (row, col) pairs map to unique flat indices
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)

        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: additive attention mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None

        Returns:
            Tensor with the same (num_windows*B, N, C) shape as ``x``.
        """
        B_, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        # Look up the learned bias for every (query, key) position pair
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)  # type: ignore
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # Add the per-window mask, broadcasting over batch and heads
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"

    def flops(self, N):
        """Return the multiply count for one window holding ``N`` tokens."""
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        #  x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops


+class SwinTransformerBlock(nn.Module): + r"""Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * 
mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if self.shift_size > 0: + attn_mask = self.calculate_mask(self.input_resolution) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def calculate_mask(self, x_size): + # calculate attention mask for SW-MSA + H, W = x_size + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + return attn_mask + + def forward(self, x, x_size): + H, W = x_size + B, L, C = x.shape + # assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) + else: + shifted_x = x + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size + if self.input_resolution == x_size: + attn_windows = self.attn( + x_windows, mask=self.attn_mask + ) # nW*B, window_size*window_size, C + else: + 
attn_windows = self.attn( + x_windows, mask=self.calculate_mask(x_size).to(x.device) + ) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return ( + f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + ) + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r"""Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." 
+ + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer + ) + else: + self.downsample = None + + def forward(self, x, x_size): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, x_size) + else: + x = blk(x, x_size) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() # type: ignore + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class RSTB(nn.Module): + """Residual Swin Transformer Block (RSTB). + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + img_size: Input image size. + patch_size: Patch size. + resi_connection: The convolutional block before residual connection. + """ + + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + img_size=224, + patch_size=4, + resi_connection="1conv", + ): + super(RSTB, self).__init__() + + self.dim = dim + self.input_resolution = input_resolution + + self.residual_group = BasicLayer( + dim=dim, + input_resolution=input_resolution, + depth=depth, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + downsample=downsample, + use_checkpoint=use_checkpoint, + ) + + if resi_connection == "1conv": + self.conv = nn.Conv2d(dim, dim, 3, 1, 1) + elif resi_connection == "3conv": + # to save parameters and memory + self.conv = nn.Sequential( + nn.Conv2d(dim, dim // 4, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(dim // 4, dim // 4, 1, 1, 0), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(dim // 4, dim, 3, 1, 1), + ) + + self.patch_embed = PatchEmbed( + 
img_size=img_size, + patch_size=patch_size, + in_chans=0, + embed_dim=dim, + norm_layer=None, + ) + + self.patch_unembed = PatchUnEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=0, + embed_dim=dim, + norm_layer=None, + ) + + def forward(self, x, x_size): + return ( + self.patch_embed( + self.conv(self.patch_unembed(self.residual_group(x, x_size), x_size)) + ) + + x + ) + + def flops(self): + flops = 0 + flops += self.residual_group.flops() + H, W = self.input_resolution + flops += H * W * self.dim * self.dim * 9 + flops += self.patch_embed.flops() + flops += self.patch_unembed.flops() + + return flops + + +class PatchEmbed(nn.Module): + r"""Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__( + self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None + ): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], # type: ignore + img_size[1] // patch_size[1], # type: ignore + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + flops = 0 + H, W = self.img_size + if self.norm is not None: + flops += H * W * self.embed_dim # type: ignore + return flops + + +class PatchUnEmbed(nn.Module): + r"""Image to Patch Unembedding + 
+ Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__( + self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None + ): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], # type: ignore + img_size[1] // patch_size[1], # type: ignore + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + def forward(self, x, x_size): + B, HW, C = x.shape + x = x.transpose(1, 2).view(B, self.embed_dim, x_size[0], x_size[1]) # B Ph*Pw C + return x + + def flops(self): + flops = 0 + return flops + + +class Upsample(nn.Sequential): + """Upsample module. + + Args: + scale (int): Scale factor. Supported scales: 2^n and 3. + num_feat (int): Channel number of intermediate features. + """ + + def __init__(self, scale, num_feat): + m = [] + if (scale & (scale - 1)) == 0: # scale = 2^n + for _ in range(int(math.log(scale, 2))): + m.append(nn.Conv2d(num_feat, 4 * num_feat, 3, 1, 1)) + m.append(nn.PixelShuffle(2)) + elif scale == 3: + m.append(nn.Conv2d(num_feat, 9 * num_feat, 3, 1, 1)) + m.append(nn.PixelShuffle(3)) + else: + raise ValueError( + f"scale {scale} is not supported. " "Supported scales: 2^n and 3." + ) + super(Upsample, self).__init__(*m) + + +class UpsampleOneStep(nn.Sequential): + """UpsampleOneStep module (the difference with Upsample is that it always only has 1conv + 1pixelshuffle) + Used in lightweight SR to save parameters. + + Args: + scale (int): Scale factor. Supported scales: 2^n and 3. 
+ num_feat (int): Channel number of intermediate features. + + """ + + def __init__(self, scale, num_feat, num_out_ch, input_resolution=None): + self.num_feat = num_feat + self.input_resolution = input_resolution + m = [] + m.append(nn.Conv2d(num_feat, (scale**2) * num_out_ch, 3, 1, 1)) + m.append(nn.PixelShuffle(scale)) + super(UpsampleOneStep, self).__init__(*m) + + def flops(self): + H, W = self.input_resolution # type: ignore + flops = H * W * self.num_feat * 3 * 9 + return flops + + +class SwinIR(nn.Module): + r"""SwinIR + A PyTorch impl of : `SwinIR: Image Restoration Using Swin Transformer`, based on Swin Transformer. + + Args: + img_size (int | tuple(int)): Input image size. Default 64 + patch_size (int | tuple(int)): Patch size. Default: 1 + in_chans (int): Number of input image channels. Default: 3 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + upscale: Upscale factor. 2/3/4/8 for image SR, 1 for denoising and compress artifact reduction + img_range: Image range. 1. or 255. + upsampler: The reconstruction reconstruction module. 
'pixelshuffle'/'pixelshuffledirect'/'nearest+conv'/None + resi_connection: The convolutional block before residual connection. '1conv'/'3conv' + """ + + def __init__( + self, + state_dict, + **kwargs, + ): + super(SwinIR, self).__init__() + + # Defaults + img_size = 64 + patch_size = 1 + in_chans = 3 + embed_dim = 96 + depths = [6, 6, 6, 6] + num_heads = [6, 6, 6, 6] + window_size = 7 + mlp_ratio = 4.0 + qkv_bias = True + qk_scale = None + drop_rate = 0.0 + attn_drop_rate = 0.0 + drop_path_rate = 0.1 + norm_layer = nn.LayerNorm + ape = False + patch_norm = True + use_checkpoint = False + upscale = 2 + img_range = 1.0 + upsampler = "" + resi_connection = "1conv" + num_feat = 64 + num_in_ch = in_chans + num_out_ch = in_chans + supports_fp16 = True + self.start_unshuffle = 1 + + self.model_arch = "SwinIR" + self.sub_type = "SR" + self.state = state_dict + if "params_ema" in self.state: + self.state = self.state["params_ema"] + elif "params" in self.state: + self.state = self.state["params"] + + state_keys = self.state.keys() + + if "conv_before_upsample.0.weight" in state_keys: + if "conv_up1.weight" in state_keys: + upsampler = "nearest+conv" + else: + upsampler = "pixelshuffle" + supports_fp16 = False + elif "upsample.0.weight" in state_keys: + upsampler = "pixelshuffledirect" + else: + upsampler = "" + + num_feat = ( + self.state.get("conv_before_upsample.0.weight", None).shape[1] + if self.state.get("conv_before_upsample.weight", None) + else 64 + ) + + if "conv_first.1.weight" in self.state: + self.state["conv_first.weight"] = self.state.pop("conv_first.1.weight") + self.state["conv_first.bias"] = self.state.pop("conv_first.1.bias") + self.start_unshuffle = round(math.sqrt(self.state["conv_first.weight"].shape[1] // 3)) + + num_in_ch = self.state["conv_first.weight"].shape[1] + in_chans = num_in_ch + if "conv_last.weight" in state_keys: + num_out_ch = self.state["conv_last.weight"].shape[0] + else: + num_out_ch = num_in_ch + + upscale = 1 + if upsampler == 
"nearest+conv": + upsample_keys = [ + x for x in state_keys if "conv_up" in x and "bias" not in x + ] + + for upsample_key in upsample_keys: + upscale *= 2 + elif upsampler == "pixelshuffle": + upsample_keys = [ + x + for x in state_keys + if "upsample" in x and "conv" not in x and "bias" not in x + ] + for upsample_key in upsample_keys: + shape = self.state[upsample_key].shape[0] + upscale *= math.sqrt(shape // num_feat) + upscale = int(upscale) + elif upsampler == "pixelshuffledirect": + upscale = int( + math.sqrt(self.state["upsample.0.bias"].shape[0] // num_out_ch) + ) + + max_layer_num = 0 + max_block_num = 0 + for key in state_keys: + result = re.match( + r"layers.(\d*).residual_group.blocks.(\d*).norm1.weight", key + ) + if result: + layer_num, block_num = result.groups() + max_layer_num = max(max_layer_num, int(layer_num)) + max_block_num = max(max_block_num, int(block_num)) + + depths = [max_block_num + 1 for _ in range(max_layer_num + 1)] + + if ( + "layers.0.residual_group.blocks.0.attn.relative_position_bias_table" + in state_keys + ): + num_heads_num = self.state[ + "layers.0.residual_group.blocks.0.attn.relative_position_bias_table" + ].shape[-1] + num_heads = [num_heads_num for _ in range(max_layer_num + 1)] + else: + num_heads = depths + + embed_dim = self.state["conv_first.weight"].shape[0] + + mlp_ratio = float( + self.state["layers.0.residual_group.blocks.0.mlp.fc1.bias"].shape[0] + / embed_dim + ) + + # TODO: could actually count the layers, but this should do + if "layers.0.conv.4.weight" in state_keys: + resi_connection = "3conv" + else: + resi_connection = "1conv" + + window_size = int( + math.sqrt( + self.state[ + "layers.0.residual_group.blocks.0.attn.relative_position_index" + ].shape[0] + ) + ) + + if "layers.0.residual_group.blocks.1.attn_mask" in state_keys: + img_size = int( + math.sqrt( + self.state["layers.0.residual_group.blocks.1.attn_mask"].shape[0] + ) + * window_size + ) + + # The JPEG models are the only ones with window-size 
7, and they also use this range + img_range = 255.0 if window_size == 7 else 1.0 + + self.in_nc = num_in_ch + self.out_nc = num_out_ch + self.num_feat = num_feat + self.embed_dim = embed_dim + self.num_heads = num_heads + self.depths = depths + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.scale = upscale / self.start_unshuffle + self.upsampler = upsampler + self.img_size = img_size + self.img_range = img_range + self.resi_connection = resi_connection + + self.supports_fp16 = False # Too much weirdness to support this at the moment + self.supports_bfp16 = True + self.min_size_restriction = 16 + + self.img_range = img_range + if in_chans == 3: + rgb_mean = (0.4488, 0.4371, 0.4040) + self.mean = torch.Tensor(rgb_mean).view(1, 3, 1, 1) + else: + self.mean = torch.zeros(1, 1, 1, 1) + self.upscale = upscale + self.upsampler = upsampler + self.window_size = window_size + + ##################################################################################################### + ################################### 1, shallow feature extraction ################################### + self.conv_first = nn.Conv2d(num_in_ch, embed_dim, 3, 1, 1) + + ##################################################################################################### + ################################### 2, deep feature extraction ###################################### + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = embed_dim + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # merge non-overlapping patches into image + self.patch_unembed = 
PatchUnEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=embed_dim, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter( # type: ignore + torch.zeros(1, num_patches, embed_dim) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build Residual Swin Transformer blocks (RSTB) + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = RSTB( + dim=embed_dim, + input_resolution=(patches_resolution[0], patches_resolution[1]), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[ + sum(depths[:i_layer]) : sum(depths[: i_layer + 1]) # type: ignore + ], # no impact on SR results + norm_layer=norm_layer, + downsample=None, + use_checkpoint=use_checkpoint, + img_size=img_size, + patch_size=patch_size, + resi_connection=resi_connection, + ) + self.layers.append(layer) + self.norm = norm_layer(self.num_features) + + # build the last conv layer in deep feature extraction + if resi_connection == "1conv": + self.conv_after_body = nn.Conv2d(embed_dim, embed_dim, 3, 1, 1) + elif resi_connection == "3conv": + # to save parameters and memory + self.conv_after_body = nn.Sequential( + nn.Conv2d(embed_dim, embed_dim // 4, 3, 1, 1), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim // 4, 1, 1, 0), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim, 3, 1, 1), + ) + + ##################################################################################################### + ################################ 3, high quality 
image reconstruction ################################ + if self.upsampler == "pixelshuffle": + # for classical SR + self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.upsample = Upsample(upscale, num_feat) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + elif self.upsampler == "pixelshuffledirect": + # for lightweight SR (to save parameters) + self.upsample = UpsampleOneStep( + upscale, + embed_dim, + num_out_ch, + (patches_resolution[0], patches_resolution[1]), + ) + elif self.upsampler == "nearest+conv": + # for real-world SR (less artifacts) + self.conv_before_upsample = nn.Sequential( + nn.Conv2d(embed_dim, num_feat, 3, 1, 1), nn.LeakyReLU(inplace=True) + ) + self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + if self.upscale == 4: + self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + elif self.upscale == 8: + self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_up3 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) + else: + # for image denoising and JPEG compression artifact reduction + self.conv_last = nn.Conv2d(embed_dim, num_out_ch, 3, 1, 1) + + self.apply(self._init_weights) + self.load_state_dict(self.state, strict=False) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore # type: ignore + def no_weight_decay(self): + return {"absolute_pos_embed"} + + @torch.jit.ignore # type: ignore + def no_weight_decay_keywords(self): + return {"relative_position_bias_table"} + + def check_image_size(self, x): + _, _, h, w = x.size() + 
mod_pad_h = (self.window_size - h % self.window_size) % self.window_size + mod_pad_w = (self.window_size - w % self.window_size) % self.window_size + x = F.pad(x, (0, mod_pad_w, 0, mod_pad_h), "reflect") + return x + + def forward_features(self, x): + x_size = (x.shape[2], x.shape[3]) + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x, x_size) + + x = self.norm(x) # B L C + x = self.patch_unembed(x, x_size) + + return x + + def forward(self, x): + H, W = x.shape[2:] + x = self.check_image_size(x) + + self.mean = self.mean.type_as(x) + x = (x - self.mean) * self.img_range + + if self.start_unshuffle > 1: + x = torch.nn.functional.pixel_unshuffle(x, self.start_unshuffle) + + if self.upsampler == "pixelshuffle": + # for classical SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.conv_before_upsample(x) + x = self.conv_last(self.upsample(x)) + elif self.upsampler == "pixelshuffledirect": + # for lightweight SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.upsample(x) + elif self.upsampler == "nearest+conv": + # for real-world SR + x = self.conv_first(x) + x = self.conv_after_body(self.forward_features(x)) + x + x = self.conv_before_upsample(x) + x = self.lrelu( + self.conv_up1( + torch.nn.functional.interpolate(x, scale_factor=2, mode="nearest") # type: ignore + ) + ) + if self.upscale == 4: + x = self.lrelu( + self.conv_up2( + torch.nn.functional.interpolate( # type: ignore + x, scale_factor=2, mode="nearest" + ) + ) + ) + elif self.upscale == 8: + x = self.lrelu(self.conv_up2(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest'))) + x = self.lrelu(self.conv_up3(torch.nn.functional.interpolate(x, scale_factor=2, mode='nearest'))) + x = self.conv_last(self.lrelu(self.conv_hr(x))) + else: + # for image denoising and JPEG compression artifact reduction + x_first = 
self.conv_first(x) + res = self.conv_after_body(self.forward_features(x_first)) + x_first + x = x + self.conv_last(res) + + x = x / self.img_range + self.mean + + return x[:, :, : H * self.upscale, : W * self.upscale] + + def flops(self): + flops = 0 + H, W = self.patches_resolution + flops += H * W * 3 * self.embed_dim * 9 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() # type: ignore + flops += H * W * 3 * self.embed_dim * self.embed_dim + flops += self.upsample.flops() # type: ignore + return flops diff --git a/ldm_patched/pfn/architecture/__init__.py b/ldm_patched/pfn/architecture/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ldm_patched/pfn/architecture/__pycache__/DAT.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/DAT.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74ead74f63bb382671e1c6b379c158699b2831b1 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/DAT.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/HAT.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/HAT.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0301f07b503934b3578e8c6752bc3bd36ef696a8 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/HAT.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/LaMa.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/LaMa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ec1bf5fe235c0d303448d413c369cdffc4322e4 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/LaMa.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/RRDB.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/RRDB.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0c0ea6c6bba9c36d74b3b36a393473330dd41f1c Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/RRDB.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/SCUNet.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/SCUNet.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9180426dd9f6bae6ca99131b3b58bdb099b1f57 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/SCUNet.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/SPSR.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/SPSR.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f799ccee73cf0f2af03e10d7c730df1c9e5514e Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/SPSR.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/SRVGG.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/SRVGG.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84cb2c031b267c5dd494b3ab0d5b3244459bd315 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/SRVGG.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/SwiftSRGAN.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/SwiftSRGAN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9f4c41459667619bec7e9624aeaab02b971c921 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/SwiftSRGAN.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/Swin2SR.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/Swin2SR.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a72b23f7d1db51a479a99f8c6acf5cf9185361e2 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/Swin2SR.cpython-310.pyc differ diff --git 
a/ldm_patched/pfn/architecture/__pycache__/SwinIR.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/SwinIR.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..49222c752bd250bbf118e5372ec354d548316345 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/SwinIR.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/__init__.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a169397d3a92fb03de82e26fc41584fdae2d22f1 Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/__init__.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/__pycache__/block.cpython-310.pyc b/ldm_patched/pfn/architecture/__pycache__/block.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f36ee16473311f2d13c72ca4ee97deee3775ed5b Binary files /dev/null and b/ldm_patched/pfn/architecture/__pycache__/block.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/block.py b/ldm_patched/pfn/architecture/block.py new file mode 100644 index 0000000000000000000000000000000000000000..d7bc5d227008a73c40f9087da1ee3ae2ca25a896 --- /dev/null +++ b/ldm_patched/pfn/architecture/block.py @@ -0,0 +1,546 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +from collections import OrderedDict +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal + +import torch +import torch.nn as nn + +#################### +# Basic blocks +#################### + + +def act(act_type: str, inplace=True, neg_slope=0.2, n_prelu=1): + # helper selecting activation + # neg_slope: for leakyrelu and init of prelu + # n_prelu: for p_relu num_parameters + act_type = act_type.lower() + if act_type == "relu": + layer = nn.ReLU(inplace) + elif act_type == "leakyrelu": + layer = nn.LeakyReLU(neg_slope, inplace) + 
elif act_type == "prelu": + layer = nn.PReLU(num_parameters=n_prelu, init=neg_slope) + else: + raise NotImplementedError( + "activation layer [{:s}] is not found".format(act_type) + ) + return layer + + +def norm(norm_type: str, nc: int): + # helper selecting normalization layer + norm_type = norm_type.lower() + if norm_type == "batch": + layer = nn.BatchNorm2d(nc, affine=True) + elif norm_type == "instance": + layer = nn.InstanceNorm2d(nc, affine=False) + else: + raise NotImplementedError( + "normalization layer [{:s}] is not found".format(norm_type) + ) + return layer + + +def pad(pad_type: str, padding): + # helper selecting padding layer + # if padding is 'zero', do by conv layers + pad_type = pad_type.lower() + if padding == 0: + return None + if pad_type == "reflect": + layer = nn.ReflectionPad2d(padding) + elif pad_type == "replicate": + layer = nn.ReplicationPad2d(padding) + else: + raise NotImplementedError( + "padding layer [{:s}] is not implemented".format(pad_type) + ) + return layer + + +def get_valid_padding(kernel_size, dilation): + kernel_size = kernel_size + (kernel_size - 1) * (dilation - 1) + padding = (kernel_size - 1) // 2 + return padding + + +class ConcatBlock(nn.Module): + # Concat the output of a submodule to its input + def __init__(self, submodule): + super(ConcatBlock, self).__init__() + self.sub = submodule + + def forward(self, x): + output = torch.cat((x, self.sub(x)), dim=1) + return output + + def __repr__(self): + tmpstr = "Identity .. 
\n|" + modstr = self.sub.__repr__().replace("\n", "\n|") + tmpstr = tmpstr + modstr + return tmpstr + + +class ShortcutBlock(nn.Module): + # Elementwise sum the output of a submodule to its input + def __init__(self, submodule): + super(ShortcutBlock, self).__init__() + self.sub = submodule + + def forward(self, x): + output = x + self.sub(x) + return output + + def __repr__(self): + tmpstr = "Identity + \n|" + modstr = self.sub.__repr__().replace("\n", "\n|") + tmpstr = tmpstr + modstr + return tmpstr + + +class ShortcutBlockSPSR(nn.Module): + # Elementwise sum the output of a submodule to its input + def __init__(self, submodule): + super(ShortcutBlockSPSR, self).__init__() + self.sub = submodule + + def forward(self, x): + return x, self.sub + + def __repr__(self): + tmpstr = "Identity + \n|" + modstr = self.sub.__repr__().replace("\n", "\n|") + tmpstr = tmpstr + modstr + return tmpstr + + +def sequential(*args): + # Flatten Sequential. It unwraps nn.Sequential. + if len(args) == 1: + if isinstance(args[0], OrderedDict): + raise NotImplementedError("sequential does not support OrderedDict input.") + return args[0] # No sequential is needed. 
+ modules = [] + for module in args: + if isinstance(module, nn.Sequential): + for submodule in module.children(): + modules.append(submodule) + elif isinstance(module, nn.Module): + modules.append(module) + return nn.Sequential(*modules) + + +ConvMode = Literal["CNA", "NAC", "CNAC"] + + +# 2x2x2 Conv Block +def conv_block_2c2( + in_nc, + out_nc, + act_type="relu", +): + return sequential( + nn.Conv2d(in_nc, out_nc, kernel_size=2, padding=1), + nn.Conv2d(out_nc, out_nc, kernel_size=2, padding=0), + act(act_type) if act_type else None, + ) + + +def conv_block( + in_nc: int, + out_nc: int, + kernel_size, + stride=1, + dilation=1, + groups=1, + bias=True, + pad_type="zero", + norm_type: str | None = None, + act_type: str | None = "relu", + mode: ConvMode = "CNA", + c2x2=False, +): + """ + Conv layer with padding, normalization, activation + mode: CNA --> Conv -> Norm -> Act + NAC --> Norm -> Act --> Conv (Identity Mappings in Deep Residual Networks, ECCV16) + """ + + if c2x2: + return conv_block_2c2(in_nc, out_nc, act_type=act_type) + + assert mode in ("CNA", "NAC", "CNAC"), "Wrong conv mode [{:s}]".format(mode) + padding = get_valid_padding(kernel_size, dilation) + p = pad(pad_type, padding) if pad_type and pad_type != "zero" else None + padding = padding if pad_type == "zero" else 0 + + c = nn.Conv2d( + in_nc, + out_nc, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias, + groups=groups, + ) + a = act(act_type) if act_type else None + if mode in ("CNA", "CNAC"): + n = norm(norm_type, out_nc) if norm_type else None + return sequential(p, c, n, a) + elif mode == "NAC": + if norm_type is None and act_type is not None: + a = act(act_type, inplace=False) + # Important! 
+ # input----ReLU(inplace)----Conv--+----output + # |________________________| + # inplace ReLU will modify the input, therefore wrong output + n = norm(norm_type, in_nc) if norm_type else None + return sequential(n, a, p, c) + else: + assert False, f"Invalid conv mode {mode}" + + +#################### +# Useful blocks +#################### + + +class ResNetBlock(nn.Module): + """ + ResNet Block, 3-3 style + with extra residual scaling used in EDSR + (Enhanced Deep Residual Networks for Single Image Super-Resolution, CVPRW 17) + """ + + def __init__( + self, + in_nc, + mid_nc, + out_nc, + kernel_size=3, + stride=1, + dilation=1, + groups=1, + bias=True, + pad_type="zero", + norm_type=None, + act_type="relu", + mode: ConvMode = "CNA", + res_scale=1, + ): + super(ResNetBlock, self).__init__() + conv0 = conv_block( + in_nc, + mid_nc, + kernel_size, + stride, + dilation, + groups, + bias, + pad_type, + norm_type, + act_type, + mode, + ) + if mode == "CNA": + act_type = None + if mode == "CNAC": # Residual path: |-CNAC-| + act_type = None + norm_type = None + conv1 = conv_block( + mid_nc, + out_nc, + kernel_size, + stride, + dilation, + groups, + bias, + pad_type, + norm_type, + act_type, + mode, + ) + # if in_nc != out_nc: + # self.project = conv_block(in_nc, out_nc, 1, stride, dilation, 1, bias, pad_type, \ + # None, None) + # print('Need a projecter in ResNetBlock.') + # else: + # self.project = lambda x:x + self.res = sequential(conv0, conv1) + self.res_scale = res_scale + + def forward(self, x): + res = self.res(x).mul(self.res_scale) + return x + res + + +class RRDB(nn.Module): + """ + Residual in Residual Dense Block + (ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks) + """ + + def __init__( + self, + nf, + kernel_size=3, + gc=32, + stride=1, + bias: bool = True, + pad_type="zero", + norm_type=None, + act_type="leakyrelu", + mode: ConvMode = "CNA", + _convtype="Conv2D", + _spectral_norm=False, + plus=False, + c2x2=False, + ): + super(RRDB, 
self).__init__() + self.RDB1 = ResidualDenseBlock_5C( + nf, + kernel_size, + gc, + stride, + bias, + pad_type, + norm_type, + act_type, + mode, + plus=plus, + c2x2=c2x2, + ) + self.RDB2 = ResidualDenseBlock_5C( + nf, + kernel_size, + gc, + stride, + bias, + pad_type, + norm_type, + act_type, + mode, + plus=plus, + c2x2=c2x2, + ) + self.RDB3 = ResidualDenseBlock_5C( + nf, + kernel_size, + gc, + stride, + bias, + pad_type, + norm_type, + act_type, + mode, + plus=plus, + c2x2=c2x2, + ) + + def forward(self, x): + out = self.RDB1(x) + out = self.RDB2(out) + out = self.RDB3(out) + return out * 0.2 + x + + +class ResidualDenseBlock_5C(nn.Module): + """ + Residual Dense Block + style: 5 convs + The core module of paper: (Residual Dense Network for Image Super-Resolution, CVPR 18) + Modified options that can be used: + - "Partial Convolution based Padding" arXiv:1811.11718 + - "Spectral normalization" arXiv:1802.05957 + - "ICASSP 2020 - ESRGAN+ : Further Improving ESRGAN" N. C. + {Rakotonirina} and A. {Rasoanaivo} + + Args: + nf (int): Channel number of intermediate features (num_feat). + gc (int): Channels for each growth (num_grow_ch: growth channel, + i.e. intermediate channels). + convtype (str): the type of convolution to use. 
Default: 'Conv2D' + gaussian_noise (bool): enable the ESRGAN+ gaussian noise (no new + trainable parameters) + plus (bool): enable the additional residual paths from ESRGAN+ + (adds trainable parameters) + """ + + def __init__( + self, + nf=64, + kernel_size=3, + gc=32, + stride=1, + bias: bool = True, + pad_type="zero", + norm_type=None, + act_type="leakyrelu", + mode: ConvMode = "CNA", + plus=False, + c2x2=False, + ): + super(ResidualDenseBlock_5C, self).__init__() + + ## + + self.conv1x1 = conv1x1(nf, gc) if plus else None + ## + + + self.conv1 = conv_block( + nf, + gc, + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=act_type, + mode=mode, + c2x2=c2x2, + ) + self.conv2 = conv_block( + nf + gc, + gc, + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=act_type, + mode=mode, + c2x2=c2x2, + ) + self.conv3 = conv_block( + nf + 2 * gc, + gc, + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=act_type, + mode=mode, + c2x2=c2x2, + ) + self.conv4 = conv_block( + nf + 3 * gc, + gc, + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=act_type, + mode=mode, + c2x2=c2x2, + ) + if mode == "CNA": + last_act = None + else: + last_act = act_type + self.conv5 = conv_block( + nf + 4 * gc, + nf, + 3, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=last_act, + mode=mode, + c2x2=c2x2, + ) + + def forward(self, x): + x1 = self.conv1(x) + x2 = self.conv2(torch.cat((x, x1), 1)) + if self.conv1x1: + # pylint: disable=not-callable + x2 = x2 + self.conv1x1(x) # + + x3 = self.conv3(torch.cat((x, x1, x2), 1)) + x4 = self.conv4(torch.cat((x, x1, x2, x3), 1)) + if self.conv1x1: + x4 = x4 + x2 # + + x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1)) + return x5 * 0.2 + x + + +def conv1x1(in_planes, out_planes, stride=1): + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, 
bias=False) + + +#################### +# Upsampler +#################### + + +def pixelshuffle_block( + in_nc: int, + out_nc: int, + upscale_factor=2, + kernel_size=3, + stride=1, + bias=True, + pad_type="zero", + norm_type: str | None = None, + act_type="relu", +): + """ + Pixel shuffle layer + (Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional + Neural Network, CVPR17) + """ + conv = conv_block( + in_nc, + out_nc * (upscale_factor**2), + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=None, + act_type=None, + ) + pixel_shuffle = nn.PixelShuffle(upscale_factor) + + n = norm(norm_type, out_nc) if norm_type else None + a = act(act_type) if act_type else None + return sequential(conv, pixel_shuffle, n, a) + + +def upconv_block( + in_nc: int, + out_nc: int, + upscale_factor=2, + kernel_size=3, + stride=1, + bias=True, + pad_type="zero", + norm_type: str | None = None, + act_type="relu", + mode="nearest", + c2x2=False, +): + # Up conv + # described in https://distill.pub/2016/deconv-checkerboard/ + upsample = nn.Upsample(scale_factor=upscale_factor, mode=mode) + conv = conv_block( + in_nc, + out_nc, + kernel_size, + stride, + bias=bias, + pad_type=pad_type, + norm_type=norm_type, + act_type=act_type, + c2x2=c2x2, + ) + return sequential(upsample, conv) diff --git a/ldm_patched/pfn/architecture/face/LICENSE-GFPGAN b/ldm_patched/pfn/architecture/face/LICENSE-GFPGAN new file mode 100644 index 0000000000000000000000000000000000000000..5ac273fd509e328f396e6e4444673a3b051a4968 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/LICENSE-GFPGAN @@ -0,0 +1,351 @@ +Tencent is pleased to support the open source community by making GFPGAN available. + +Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. + +GFPGAN is licensed under the Apache License Version 2.0 except for the third-party components listed below. 
+ + +Terms of the Apache License Version 2.0: +--------------------------------------------- +Apache License + +Version 2.0, January 2004 + +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +1. Definitions. + +“License” shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, “control” means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +“Object” form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +“Work” shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +“Derivative Works” shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, “submitted” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and + +You must cause any modified files to carry prominent notices stating that You changed the files; and + +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + +If the Work includes a “NOTICE” text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + + + +Other dependencies and licenses: + + +Open Source Software licensed under the Apache 2.0 license and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. basicsr +Copyright 2018-2020 BasicSR Authors + + +This BasicSR project is released under the Apache 2.0 license. + +A copy of Apache 2.0 is included in this file. + +StyleGAN2 +The codes are modified from the repository stylegan2-pytorch. Many thanks to the author - Kim Seonghyeon 😊 for translating from the official TensorFlow codes to PyTorch ones. Here is the license of stylegan2-pytorch. +The official repository is https://github.com/NVlabs/stylegan2, and here is the NVIDIA license. +DFDNet +The codes are largely modified from the repository DFDNet. Their license is Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. + +Terms of the Nvidia License: +--------------------------------------------- + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +"Nvidia Processors" means any central processing unit (CPU), graphics +processing unit (GPU), field-programmable gate array (FPGA), +application-specific integrated circuit (ASIC) or any combination +thereof designed, made, sold, or provided by Nvidia or its affiliates. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. 
+ +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by Nvidia + or its affiliates commercially or non-commercially. As used herein, + "non-commercially" means for research or evaluation purposes only. + + 3.4 Patent Claims. 
If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grants in Sections 2.1 and 2.2) will + terminate immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor's or its affiliates' names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grants in Sections 2.1 and + 2.2) will terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. 
+ +MIT License + +Copyright (c) 2019 Kim Seonghyeon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + + +Open Source Software licensed under the BSD 3-Clause license: +--------------------------------------------- +1. torchvision +Copyright (c) Soumith Chintala 2016, +All rights reserved. + +2. 
torch +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + + +Terms of the BSD 3-Clause License: +--------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +Open Source Software licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. numpy +Copyright (c) 2005-2020, NumPy Developers. +All rights reserved. + +A copy of BSD 3-Clause License is included in this file. + +The NumPy repository and source distributions bundle several libraries that are +compatibly licensed. We list these here. + +Name: Numpydoc +Files: doc/sphinxext/numpydoc/* +License: BSD-2-Clause + For details, see doc/sphinxext/LICENSE.txt + +Name: scipy-sphinx-theme +Files: doc/scipy-sphinx-theme/* +License: BSD-3-Clause AND PSF-2.0 AND Apache-2.0 + For details, see doc/scipy-sphinx-theme/LICENSE.txt + +Name: lapack-lite +Files: numpy/linalg/lapack_lite/* +License: BSD-3-Clause + For details, see numpy/linalg/lapack_lite/LICENSE.txt + +Name: tempita +Files: tools/npy_tempita/* +License: MIT + For details, see tools/npy_tempita/license.txt + +Name: dragon4 +Files: numpy/core/src/multiarray/dragon4.c +License: MIT + For license text, see numpy/core/src/multiarray/dragon4.c + + + +Open Source Software licensed under the MIT license: +--------------------------------------------- +1. facexlib +Copyright (c) 2020 Xintao Wang + +2. opencv-python +Copyright (c) Olli-Pekka Heinisuo +Please note that only files in cv2 package are used. 
+ + +Terms of the MIT License: +--------------------------------------------- +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + +Open Source Software licensed under the MIT license and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. tqdm +Copyright (c) 2013 noamraph + +`tqdm` is a product of collaborative work. +Unless otherwise stated, all authors (see commit logs) retain copyright +for their respective work, and release the work under the MIT licence +(text below). + +Exceptions or notable authors are listed below +in reverse chronological order: + +* files: * + MPLv2.0 2015-2020 (c) Casper da Costa-Luis + [casperdcl](https://github.com/casperdcl). +* files: tqdm/_tqdm.py + MIT 2016 (c) [PR #96] on behalf of Google Inc. +* files: tqdm/_tqdm.py setup.py README.rst MANIFEST.in .gitignore + MIT 2013 (c) Noam Yorav-Raphael, original author. + +[PR #96]: https://github.com/tqdm/tqdm/pull/96 + + +Mozilla Public Licence (MPL) v. 
2.0 - Exhibit A +----------------------------------------------- + +This Source Code Form is subject to the terms of the +Mozilla Public License, v. 2.0. +If a copy of the MPL was not distributed with this file, +You can obtain one at https://mozilla.org/MPL/2.0/. + + +MIT License (MIT) +----------------- + +Copyright (c) 2013 noamraph + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/ldm_patched/pfn/architecture/face/LICENSE-RestoreFormer b/ldm_patched/pfn/architecture/face/LICENSE-RestoreFormer new file mode 100644 index 0000000000000000000000000000000000000000..5ac273fd509e328f396e6e4444673a3b051a4968 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/LICENSE-RestoreFormer @@ -0,0 +1,351 @@ +Tencent is pleased to support the open source community by making GFPGAN available. + +Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. 
+ +GFPGAN is licensed under the Apache License Version 2.0 except for the third-party components listed below. + + +Terms of the Apache License Version 2.0: +--------------------------------------------- +Apache License + +Version 2.0, January 2004 + +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION +1. Definitions. + +“License” shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +“Licensor” shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +“Legal Entity” shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, “control” means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +“You” (or “Your”) shall mean an individual or Legal Entity exercising permissions granted by this License. + +“Source” form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +“Object” form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +“Work” shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
+ +“Derivative Works” shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +“Contribution” shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, “submitted” means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as “Not a Contribution.” + +“Contributor” shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and + +You must cause any modified files to carry prominent notices stating that You changed the files; and + +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + +If the Work includes a “NOTICE” text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + + + +Other dependencies and licenses: + + +Open Source Software licensed under the Apache 2.0 license and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. basicsr +Copyright 2018-2020 BasicSR Authors + + +This BasicSR project is released under the Apache 2.0 license. + +A copy of Apache 2.0 is included in this file. + +StyleGAN2 +The codes are modified from the repository stylegan2-pytorch. Many thanks to the author - Kim Seonghyeon 😊 for translating from the official TensorFlow codes to PyTorch ones. Here is the license of stylegan2-pytorch. +The official repository is https://github.com/NVlabs/stylegan2, and here is the NVIDIA license. +DFDNet +The codes are largely modified from the repository DFDNet. Their license is Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. + +Terms of the Nvidia License: +--------------------------------------------- + +1. Definitions + +"Licensor" means any person or entity that distributes its Work. + +"Software" means the original work of authorship made available under +this License. + +"Work" means the Software and any additions to or derivative works of +the Software that are made available under this License. + +"Nvidia Processors" means any central processing unit (CPU), graphics +processing unit (GPU), field-programmable gate array (FPGA), +application-specific integrated circuit (ASIC) or any combination +thereof designed, made, sold, or provided by Nvidia or its affiliates. + +The terms "reproduce," "reproduction," "derivative works," and +"distribution" have the meaning as provided under U.S. copyright law; +provided, however, that for the purposes of this License, derivative +works shall not include works that remain separable from, or merely +link (or bind by name) to the interfaces of, the Work. 
+ +Works, including the Software, are "made available" under this License +by including in or with the Work either (a) a copyright notice +referencing the applicability of this License to the Work, or (b) a +copy of this License. + +2. License Grants + + 2.1 Copyright Grant. Subject to the terms and conditions of this + License, each Licensor grants to you a perpetual, worldwide, + non-exclusive, royalty-free, copyright license to reproduce, + prepare derivative works of, publicly display, publicly perform, + sublicense and distribute its Work and any resulting derivative + works in any form. + +3. Limitations + + 3.1 Redistribution. You may reproduce or distribute the Work only + if (a) you do so under this License, (b) you include a complete + copy of this License with your distribution, and (c) you retain + without modification any copyright, patent, trademark, or + attribution notices that are present in the Work. + + 3.2 Derivative Works. You may specify that additional or different + terms apply to the use, reproduction, and distribution of your + derivative works of the Work ("Your Terms") only if (a) Your Terms + provide that the use limitation in Section 3.3 applies to your + derivative works, and (b) you identify the specific derivative + works that are subject to Your Terms. Notwithstanding Your Terms, + this License (including the redistribution requirements in Section + 3.1) will continue to apply to the Work itself. + + 3.3 Use Limitation. The Work and any derivative works thereof only + may be used or intended for use non-commercially. The Work or + derivative works thereof may be used or intended for use by Nvidia + or its affiliates commercially or non-commercially. As used herein, + "non-commercially" means for research or evaluation purposes only. + + 3.4 Patent Claims. 
If you bring or threaten to bring a patent claim + against any Licensor (including any claim, cross-claim or + counterclaim in a lawsuit) to enforce any patents that you allege + are infringed by any Work, then your rights under this License from + such Licensor (including the grants in Sections 2.1 and 2.2) will + terminate immediately. + + 3.5 Trademarks. This License does not grant any rights to use any + Licensor's or its affiliates' names, logos, or trademarks, except + as necessary to reproduce the notices described in this License. + + 3.6 Termination. If you violate any term of this License, then your + rights under this License (including the grants in Sections 2.1 and + 2.2) will terminate immediately. + +4. Disclaimer of Warranty. + +THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +THIS LICENSE. + +5. Limitation of Liability. + +EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +(INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +THE POSSIBILITY OF SUCH DAMAGES. 
+ +MIT License + +Copyright (c) 2019 Kim Seonghyeon + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + + + +Open Source Software licensed under the BSD 3-Clause license: +--------------------------------------------- +1. torchvision +Copyright (c) Soumith Chintala 2016, +All rights reserved. + +2. 
torch +Copyright (c) 2016- Facebook, Inc (Adam Paszke) +Copyright (c) 2014- Facebook, Inc (Soumith Chintala) +Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) +Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) +Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) +Copyright (c) 2011-2013 NYU (Clement Farabet) +Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) +Copyright (c) 2006 Idiap Research Institute (Samy Bengio) +Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) + + +Terms of the BSD 3-Clause License: +--------------------------------------------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +Open Source Software licensed under the BSD 3-Clause License and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. numpy +Copyright (c) 2005-2020, NumPy Developers. +All rights reserved. + +A copy of BSD 3-Clause License is included in this file. + +The NumPy repository and source distributions bundle several libraries that are +compatibly licensed. We list these here. + +Name: Numpydoc +Files: doc/sphinxext/numpydoc/* +License: BSD-2-Clause + For details, see doc/sphinxext/LICENSE.txt + +Name: scipy-sphinx-theme +Files: doc/scipy-sphinx-theme/* +License: BSD-3-Clause AND PSF-2.0 AND Apache-2.0 + For details, see doc/scipy-sphinx-theme/LICENSE.txt + +Name: lapack-lite +Files: numpy/linalg/lapack_lite/* +License: BSD-3-Clause + For details, see numpy/linalg/lapack_lite/LICENSE.txt + +Name: tempita +Files: tools/npy_tempita/* +License: MIT + For details, see tools/npy_tempita/license.txt + +Name: dragon4 +Files: numpy/core/src/multiarray/dragon4.c +License: MIT + For license text, see numpy/core/src/multiarray/dragon4.c + + + +Open Source Software licensed under the MIT license: +--------------------------------------------- +1. facexlib +Copyright (c) 2020 Xintao Wang + +2. opencv-python +Copyright (c) Olli-Pekka Heinisuo +Please note that only files in cv2 package are used. 
+ + +Terms of the MIT License: +--------------------------------------------- +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + + +Open Source Software licensed under the MIT license and Other Licenses of the Third-Party Components therein: +--------------------------------------------- +1. tqdm +Copyright (c) 2013 noamraph + +`tqdm` is a product of collaborative work. +Unless otherwise stated, all authors (see commit logs) retain copyright +for their respective work, and release the work under the MIT licence +(text below). + +Exceptions or notable authors are listed below +in reverse chronological order: + +* files: * + MPLv2.0 2015-2020 (c) Casper da Costa-Luis + [casperdcl](https://github.com/casperdcl). +* files: tqdm/_tqdm.py + MIT 2016 (c) [PR #96] on behalf of Google Inc. +* files: tqdm/_tqdm.py setup.py README.rst MANIFEST.in .gitignore + MIT 2013 (c) Noam Yorav-Raphael, original author. + +[PR #96]: https://github.com/tqdm/tqdm/pull/96 + + +Mozilla Public Licence (MPL) v. 
2.0 - Exhibit A +----------------------------------------------- + +This Source Code Form is subject to the terms of the +Mozilla Public License, v. 2.0. +If a copy of the MPL was not distributed with this file, +You can obtain one at https://mozilla.org/MPL/2.0/. + + +MIT License (MIT) +----------------- + +Copyright (c) 2013 noamraph + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/ldm_patched/pfn/architecture/face/LICENSE-codeformer b/ldm_patched/pfn/architecture/face/LICENSE-codeformer new file mode 100644 index 0000000000000000000000000000000000000000..be6c4ed8048a7cb436376bbea84cb0bd726ab721 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/LICENSE-codeformer @@ -0,0 +1,35 @@ +S-Lab License 1.0 + +Copyright 2022 S-Lab + +Redistribution and use for non-commercial purpose in source and +binary forms, with or without modification, are permitted provided +that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +In the event that redistribution and/or use for commercial purpose in +source or binary forms, with or without modification is required, +please contact the contributor(s) of the work. 
diff --git a/ldm_patched/pfn/architecture/face/__pycache__/codeformer.cpython-310.pyc b/ldm_patched/pfn/architecture/face/__pycache__/codeformer.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bd76c822edff51e5ff4b03e3a9636be5a093cf80 Binary files /dev/null and b/ldm_patched/pfn/architecture/face/__pycache__/codeformer.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/face/__pycache__/gfpganv1_clean_arch.cpython-310.pyc b/ldm_patched/pfn/architecture/face/__pycache__/gfpganv1_clean_arch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4976a68c9d5a5ccc6241c95f2f89ef79e93f2e90 Binary files /dev/null and b/ldm_patched/pfn/architecture/face/__pycache__/gfpganv1_clean_arch.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/face/__pycache__/restoreformer_arch.cpython-310.pyc b/ldm_patched/pfn/architecture/face/__pycache__/restoreformer_arch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..43ec0bec6e524e6913bbcc53ab9fdeb7921d8ca8 Binary files /dev/null and b/ldm_patched/pfn/architecture/face/__pycache__/restoreformer_arch.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/face/__pycache__/stylegan2_clean_arch.cpython-310.pyc b/ldm_patched/pfn/architecture/face/__pycache__/stylegan2_clean_arch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..373a35fe3a486d2cad94b445c6004a3d806f6b2f Binary files /dev/null and b/ldm_patched/pfn/architecture/face/__pycache__/stylegan2_clean_arch.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/face/arcface_arch.py b/ldm_patched/pfn/architecture/face/arcface_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..b548af059a71b38c6c18cd35cbfed7bae7e55441 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/arcface_arch.py @@ -0,0 +1,265 @@ +import torch.nn as nn + + +def conv3x3(inplanes, outplanes, stride=1): + """A simple 
class BasicBlock(nn.Module):
    """Plain two-convolution residual unit for the ResNetArcFace backbone.

    The main branch is conv3x3 -> BN -> ReLU -> conv3x3 -> BN. The input
    (optionally passed through ``downsample``) is added back onto it and a
    final ReLU is applied.

    Args:
        inplanes (int): Number of input channels.
        planes (int): Number of output channels.
        stride (int): Stride of the first convolution. Default: 1.
        downsample (nn.Module): Optional module applied to the input to build
            the shortcut branch. Default: None (identity shortcut).
    """

    expansion = 1  # output channel expansion ratio

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super().__init__()
        # Sub-modules are registered in execution order; keeping this order
        # keeps parameter names and random initialisation order unchanged.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Main branch.
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        # Shortcut branch: identity unless a projection module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        # In-place accumulation, as in the reference implementation.
        main += shortcut
        return self.relu(main)
+ """ + + expansion = 1 # output channel expansion ratio + + def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True): + super(IRBlock, self).__init__() + self.bn0 = nn.BatchNorm2d(inplanes) + self.conv1 = conv3x3(inplanes, inplanes) + self.bn1 = nn.BatchNorm2d(inplanes) + self.prelu = nn.PReLU() + self.conv2 = conv3x3(inplanes, planes, stride) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + self.use_se = use_se + if self.use_se: + self.se = SEBlock(planes) + + def forward(self, x): + residual = x + out = self.bn0(x) + out = self.conv1(out) + out = self.bn1(out) + out = self.prelu(out) + + out = self.conv2(out) + out = self.bn2(out) + if self.use_se: + out = self.se(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.prelu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block used in the ResNetArcFace architecture. + + Args: + inplanes (int): Channel number of inputs. + planes (int): Channel number of outputs. + stride (int): Stride in convolution. Default: 1. + downsample (nn.Module): The downsample module. Default: None. 
class SEBlock(nn.Module):
    """Squeeze-and-excitation gate used inside the IRBlock.

    Each channel is globally average-pooled to a single value, passed through
    a two-layer bottleneck MLP ending in a sigmoid, and the resulting
    per-channel weight rescales the input feature map.

    Args:
        channel (int): Number of input channels.
        reduction (int): Channel reduction ratio of the bottleneck. Default: 16.
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        bottleneck = channel // reduction
        # Collapse every channel to 1x1, discarding spatial information.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        gate_layers = [
            nn.Linear(channel, bottleneck),
            nn.PReLU(),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        # Unpacking four dims also enforces a 4D (N, C, H, W) input.
        batch, channels, _height, _width = x.size()
        squeezed = self.avg_pool(x).view(batch, channels)
        gate = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gate
+ use_se (bool): Whether use the SEBlock (squeeze and excitation block). Default: True. + """ + + def __init__(self, block, layers, use_se=True): + if block == "IRBlock": + block = IRBlock + self.inplanes = 64 + self.use_se = use_se + super(ResNetArcFace, self).__init__() + + self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.prelu = nn.PReLU() + self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.bn4 = nn.BatchNorm2d(512) + self.dropout = nn.Dropout() + self.fc5 = nn.Linear(512 * 8 * 8, 512) + self.bn5 = nn.BatchNorm1d(512) + + # initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.xavier_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.xavier_normal_(m.weight) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, planes, num_blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm2d(planes * block.expansion), + ) + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, use_se=self.use_se) + ) + self.inplanes = planes + for _ in range(1, num_blocks): + layers.append(block(self.inplanes, planes, use_se=self.use_se)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = 
class VectorQuantizer(nn.Module):
    """Nearest-neighbour codebook quantizer (VQ-VAE style).

    Args:
        codebook_size (int): Number of codebook entries.
        emb_dim (int): Dimensionality of each codebook entry.
        beta (float): Commitment cost, i.e. the weight of the
            ``||z_e(x) - sg[e]||^2`` term in the returned loss.
    """

    def __init__(self, codebook_size, emb_dim, beta):
        super().__init__()
        self.codebook_size = codebook_size  # number of embeddings
        self.emb_dim = emb_dim  # dimension of embedding
        self.beta = beta  # commitment cost used in loss term
        self.embedding = nn.Embedding(self.codebook_size, self.emb_dim)
        self.embedding.weight.data.uniform_(
            -1.0 / self.codebook_size, 1.0 / self.codebook_size
        )

    def forward(self, z):
        """Quantize ``z`` to its nearest codebook entries.

        Args:
            z (Tensor): Features laid out as (batch, emb_dim, height, width).

        Returns:
            tuple: ``(z_q, loss, stats)`` where ``z_q`` has the shape of ``z``
            (with straight-through gradients), ``loss`` is the codebook plus
            commitment loss, and ``stats`` is a dict with the keys
            ``perplexity``, ``min_encodings`` (one-hot, tokens x codebook_size),
            ``min_encoding_indices`` (tokens x 1), ``min_encoding_scores`` and
            ``mean_distance``.
        """
        # (batch, channel, h, w) -> (batch, h, w, channel), then flatten tokens.
        z = z.permute(0, 2, 3, 1).contiguous()
        z_flattened = z.view(-1, self.emb_dim)
        codebook = self.embedding.weight

        # Squared distances via (z - e)^2 = z^2 + e^2 - 2 e * z.
        d = (
            (z_flattened**2).sum(dim=1, keepdim=True)
            + (codebook**2).sum(1)
            - 2 * torch.matmul(z_flattened, codebook.t())
        )

        mean_distance = torch.mean(d)
        # Closest entry per token (topk with k=1 keeps the (tokens, 1) shape).
        min_encoding_scores, min_encoding_indices = torch.topk(
            d, 1, dim=1, largest=False
        )
        # Map distance to a (0, 1] confidence: higher score, higher confidence.
        min_encoding_scores = torch.exp(-min_encoding_scores / 10)

        # One-hot assignments, allocated directly with z's dtype and device
        # instead of on the CPU followed by a transfer.
        min_encodings = z.new_zeros(min_encoding_indices.shape[0], self.codebook_size)
        min_encodings.scatter_(1, min_encoding_indices, 1)

        # Quantized latents: a row lookup is equivalent to multiplying the
        # one-hot matrix with the codebook, without the dense matmul.
        z_q = self.embedding(min_encoding_indices.squeeze(1)).view(z.shape)
        # Codebook loss + beta * commitment loss.
        loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
            (z_q - z.detach()) ** 2
        )
        # Straight-through estimator: forward z_q, backward identity to z.
        z_q = z + (z_q - z).detach()

        # Perplexity of the codebook usage distribution.
        e_mean = torch.mean(min_encodings, dim=0)
        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
        # Back to (batch, channel, h, w).
        z_q = z_q.permute(0, 3, 1, 2).contiguous()

        return (
            z_q,
            loss,
            {
                "perplexity": perplexity,
                "min_encodings": min_encodings,
                "min_encoding_indices": min_encoding_indices,
                "min_encoding_scores": min_encoding_scores,
                "mean_distance": mean_distance,
            },
        )

    def get_codebook_feat(self, indices, shape):
        """Look up codebook vectors for integer ``indices``.

        Args:
            indices (Tensor): Integer codebook indices; flattened to
                (batch * token_num) in row-major order.
            shape (tuple or None): Target (batch, height, width, channel)
                shape. When given, the result is reshaped to it and permuted
                to (batch, channel, height, width); when None the flat
                (tokens, emb_dim) matrix is returned.

        Returns:
            Tensor: The selected codebook entries.
        """
        # reshape (not view) so non-contiguous index tensors are accepted; the
        # direct lookup avoids the dense one-hot matrix and its float32 matmul,
        # which does not work when the codebook is held in half precision.
        z_q = self.embedding(indices.reshape(-1))

        if shape is not None:  # reshape back to match original input shape
            z_q = z_q.view(shape).permute(0, 3, 1, 2).contiguous()

        return z_q
class Downsample(nn.Module):
    """Halve the spatial resolution with a stride-2 3x3 convolution.

    The convolution itself is unpadded; instead the input receives one row of
    zeros at the bottom and one column of zeros on the right before it is
    convolved.

    Args:
        in_channels (int): Number of input (and output) channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=2, padding=0
        )

    def forward(self, x):
        # F.pad order for the last two dims is (left, right, top, bottom).
        padded = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
class Encoder(nn.Module):
    """Convolutional VQGAN encoder built as a flat list of blocks.

    Layout: stem conv -> per-level ResBlocks (with an AttnBlock after each
    ResBlock at the resolutions listed in ``attn_resolutions``) and a
    Downsample between levels -> ResBlock/AttnBlock/ResBlock -> normalize ->
    output conv.

    Args:
        in_channels (int): Channels of the input image.
        nf (int): Base channel width.
        out_channels (int): Channels of the produced latent.
        ch_mult (sequence of int): Channel multiplier of each level.
        num_res_blocks (int): ResBlocks per level.
        resolution (int): Input resolution, used to decide where attention
            blocks are placed.
        attn_resolutions (sequence of int): Resolutions that get attention.
    """

    def __init__(
        self,
        in_channels,
        nf,
        out_channels,
        ch_mult,
        num_res_blocks,
        resolution,
        attn_resolutions,
    ):
        super().__init__()
        self.nf = nf
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.attn_resolutions = attn_resolutions

        res = self.resolution
        # Input multiplier of level i is the output multiplier of level i - 1
        # (1 for the first level, which follows the stem convolution).
        in_mults = (1, *ch_mult)

        # Stem convolution.
        layers = [nn.Conv2d(in_channels, nf, kernel_size=3, stride=1, padding=1)]

        # Residual and downsampling stages, attention at the listed resolutions.
        # NOTE(review): the attention check sits inside the per-ResBlock loop.
        for level, (mult_in, mult_out) in enumerate(zip(in_mults, ch_mult)):
            width = nf * mult_in
            width_out = nf * mult_out
            for _ in range(self.num_res_blocks):
                layers.append(ResBlock(width, width_out))
                width = width_out
                if res in attn_resolutions:
                    layers.append(AttnBlock(width))

            if level != self.num_resolutions - 1:
                layers.append(Downsample(width))
                res = res // 2

        # Non-local attention block, then normalise and project to latent size.
        layers += [
            ResBlock(width, width),  # type: ignore
            AttnBlock(width),  # type: ignore
            ResBlock(width, width),  # type: ignore
            normalize(width),  # type: ignore
            nn.Conv2d(width, out_channels, kernel_size=3, stride=1, padding=1),  # type: ignore
        ]
        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        # Apply every block in registration order.
        for layer in self.blocks:
            x = layer(x)
        return x
class VQAutoEncoder(nn.Module):
    """VQGAN autoencoder: ``Encoder`` -> codebook quantizer -> ``Generator``.

    Args:
        img_size (int): Image resolution handed to the encoder and generator.
        nf (int): Base channel width.
        ch_mult (sequence of int): Channel multiplier of each level.
        quantizer (str): ``"nearest"`` (``VectorQuantizer``) or ``"gumbel"``
            (``GumbelQuantizer``). Default: ``"nearest"``.
        res_blocks (int): Residual blocks per level. Default: 2.
        attn_resolutions (list of int, optional): Resolutions that receive
            attention blocks. Default: None, meaning ``[16]``.
        codebook_size (int): Number of codebook entries. Default: 1024.
        emb_dim (int): Latent / codebook dimensionality. Default: 256.
        beta (float): Commitment cost for the nearest quantizer. Default: 0.25.
        gumbel_straight_through (bool): Straight-through flag for the gumbel
            quantizer. Default: False.
        gumbel_kl_weight (float): KL weight for the gumbel quantizer.
            Default: 1e-8.
        model_path (str, optional): Checkpoint to load; it must contain a
            ``"params_ema"`` or ``"params"`` state dict. Default: None.

    Raises:
        ValueError: If ``quantizer`` is not a supported name, or the
            checkpoint has neither ``"params_ema"`` nor ``"params"``.
    """

    def __init__(
        self,
        img_size,
        nf,
        ch_mult,
        quantizer="nearest",
        res_blocks=2,
        attn_resolutions=None,
        codebook_size=1024,
        emb_dim=256,
        beta=0.25,
        gumbel_straight_through=False,
        gumbel_kl_weight=1e-8,
        model_path=None,
    ):
        super().__init__()
        # A fresh list per instance: a list literal as the default value would
        # be shared (and mutable) across every instance of the class.
        if attn_resolutions is None:
            attn_resolutions = [16]

        self.in_channels = 3
        self.nf = nf
        self.n_blocks = res_blocks
        self.codebook_size = codebook_size
        self.embed_dim = emb_dim
        self.ch_mult = ch_mult
        self.resolution = img_size
        self.attn_resolutions = attn_resolutions
        self.quantizer_type = quantizer
        self.encoder = Encoder(
            self.in_channels,
            self.nf,
            self.embed_dim,
            self.ch_mult,
            self.n_blocks,
            self.resolution,
            self.attn_resolutions,
        )
        if self.quantizer_type == "nearest":
            self.beta = beta  # 0.25
            self.quantize = VectorQuantizer(
                self.codebook_size, self.embed_dim, self.beta
            )
        elif self.quantizer_type == "gumbel":
            self.gumbel_num_hiddens = emb_dim
            self.straight_through = gumbel_straight_through
            self.kl_weight = gumbel_kl_weight
            self.quantize = GumbelQuantizer(
                self.codebook_size,
                self.embed_dim,
                self.gumbel_num_hiddens,
                self.straight_through,
                self.kl_weight,
            )
        else:
            # Fail at construction time instead of leaving ``self.quantize``
            # undefined and failing later inside ``forward``.
            raise ValueError(
                f"Unsupported quantizer {quantizer!r}; expected 'nearest' or 'gumbel'."
            )
        self.generator = Generator(
            nf, ch_mult, res_blocks, img_size, attn_resolutions, emb_dim
        )

        if model_path is not None:
            # NOTE(security): torch.load unpickles the file; only load
            # checkpoints from trusted sources.
            # The checkpoint is read from disk once and reused below.
            chkpt = torch.load(model_path, map_location="cpu")
            # Prefer the EMA weights when both entries are present.
            for key in ("params_ema", "params"):
                if key in chkpt:
                    self.load_state_dict(chkpt[key])
                    logger.info(f"vqgan is loaded from: {model_path} [{key}]")
                    break
            else:
                raise ValueError("Wrong params!")

    def forward(self, x):
        """Encode, quantize and decode ``x``.

        Returns:
            tuple: ``(reconstruction, codebook_loss, quant_stats)`` as produced
            by the generator and the configured quantizer.
        """
        x = self.encoder(x)
        quant, codebook_loss, quant_stats = self.quantize(x)
        x = self.generator(quant)
        return x, codebook_loss, quant_stats
def adaptive_instance_normalization(content_feat, style_feat):
    """Adaptive instance normalization.

    Re-colours ``content_feat`` with the per-channel statistics of
    ``style_feat``: the content features are normalised with their own
    mean/std (as returned by ``calc_mean_std``) and then rescaled and shifted
    with the std/mean of the style features.

    Args:
        content_feat (Tensor): Features to be re-normalised; the output has
            this tensor's size.
        style_feat (Tensor): Features that supply the target mean and std.

    Returns:
        Tensor: The re-normalised features.
    """
    target_shape = content_feat.size()
    style_mean, style_std = calc_mean_std(style_feat)
    content_mean, content_std = calc_mean_std(content_feat)

    # Whiten with the content statistics ...
    centered = content_feat - content_mean.expand(target_shape)
    whitened = centered / content_std.expand(target_shape)

    # ... then apply the style statistics.
    rescaled = whitened * style_std.expand(target_shape)
    return rescaled + style_mean.expand(target_shape)
+ """ + + def __init__( + self, num_pos_feats=64, temperature=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask=None): + if mask is None: + mask = torch.zeros( + (x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool + ) + not_mask = ~mask # pylint: disable=invalid-unary-operand-type + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +class TransformerSALayer(nn.Module): + def __init__( + self, embed_dim, nhead=8, dim_mlp=2048, dropout=0.0, activation="gelu" + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(embed_dim, nhead, dropout=dropout) + # Implementation of Feedforward model - MLP + self.linear1 = 
class ResBlock(nn.Module):
    """Pre-activation residual block: (norm -> swish -> 3x3 conv) twice.

    When the input and output widths differ, the skip path is projected with
    a 1x1 convolution (``conv_out``) before the addition.

    Args:
        in_channels (int): Channel count of the input.
        out_channels (int | None): Channel count of the output. ``None``
            (the default) keeps the input width.
    """

    def __init__(self, in_channels, out_channels=None):
        super().__init__()
        # Resolve the default first: the layers below must be built with the
        # resolved width, not the raw (possibly None) argument.
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels

        self.norm1 = normalize(in_channels)
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        self.norm2 = normalize(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1
        )
        if self.in_channels != self.out_channels:
            self.conv_out = nn.Conv2d(
                in_channels, out_channels, kernel_size=1, stride=1, padding=0
            )

    def forward(self, x_in):
        """Return ``residual(x_in) + skip(x_in)``."""
        x = self.norm1(x_in)
        x = swish(x)
        x = self.conv1(x)
        x = self.norm2(x)
        x = swish(x)
        x = self.conv2(x)
        if self.in_channels != self.out_channels:
            # Match the skip path to the new channel width.
            x_in = self.conv_out(x_in)

        return x + x_in
[1, 2, 2, 4, 4, 8], "nearest", 2, [16], codebook_size + ) + + if fix_modules is not None: + for module in fix_modules: + for param in getattr(self, module).parameters(): + param.requires_grad = False + + self.connect_list = connect_list + self.n_layers = n_layers + self.dim_embd = dim_embd + self.dim_mlp = dim_embd * 2 + + self.position_emb = nn.Parameter(torch.zeros(latent_size, self.dim_embd)) # type: ignore + self.feat_emb = nn.Linear(256, self.dim_embd) + + # transformer + self.ft_layers = nn.Sequential( + *[ + TransformerSALayer( + embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0 + ) + for _ in range(self.n_layers) + ] + ) + + # logits_predict head + self.idx_pred_layer = nn.Sequential( + nn.LayerNorm(dim_embd), nn.Linear(dim_embd, codebook_size, bias=False) + ) + + self.channels = { + "16": 512, + "32": 256, + "64": 256, + "128": 128, + "256": 128, + "512": 64, + } + + # after second residual block for > 16, before attn layer for ==16 + self.fuse_encoder_block = { + "512": 2, + "256": 5, + "128": 8, + "64": 11, + "32": 14, + "16": 18, + } + # after first residual block for > 16, before attn layer for ==16 + self.fuse_generator_block = { + "16": 6, + "32": 9, + "64": 12, + "128": 15, + "256": 18, + "512": 21, + } + + # fuse_convs_dict + self.fuse_convs_dict = nn.ModuleDict() + for f_size in self.connect_list: + in_ch = self.channels[f_size] + self.fuse_convs_dict[f_size] = Fuse_sft_block(in_ch, in_ch) + + self.load_state_dict(state_dict) + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.02) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def forward(self, x, weight=0.5, **kwargs): + detach_16 = True + code_only = False + adain = True + # ################### Encoder ##################### + enc_feat_dict = {} + 
out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list] + for i, block in enumerate(self.encoder.blocks): + x = block(x) + if i in out_list: + enc_feat_dict[str(x.shape[-1])] = x.clone() + + lq_feat = x + # ################# Transformer ################### + # quant_feat, codebook_loss, quant_stats = self.quantize(lq_feat) + pos_emb = self.position_emb.unsqueeze(1).repeat(1, x.shape[0], 1) + # BCHW -> BC(HW) -> (HW)BC + feat_emb = self.feat_emb(lq_feat.flatten(2).permute(2, 0, 1)) + query_emb = feat_emb + # Transformer encoder + for layer in self.ft_layers: + query_emb = layer(query_emb, query_pos=pos_emb) + + # output logits + logits = self.idx_pred_layer(query_emb) # (hw)bn + logits = logits.permute(1, 0, 2) # (hw)bn -> b(hw)n + + if code_only: # for training stage II + # logits doesn't need softmax before cross_entropy loss + return logits, lq_feat + + # ################# Quantization ################### + # if self.training: + # quant_feat = torch.einsum('btn,nc->btc', [soft_one_hot, self.quantize.embedding.weight]) + # # b(hw)c -> bc(hw) -> bchw + # quant_feat = quant_feat.permute(0,2,1).view(lq_feat.shape) + # ------------ + soft_one_hot = F.softmax(logits, dim=2) + _, top_idx = torch.topk(soft_one_hot, 1, dim=2) + quant_feat = self.quantize.get_codebook_feat( + top_idx, shape=[x.shape[0], 16, 16, 256] # type: ignore + ) + # preserve gradients + # quant_feat = lq_feat + (quant_feat - lq_feat).detach() + + if detach_16: + quant_feat = quant_feat.detach() # for training stage III + if adain: + quant_feat = adaptive_instance_normalization(quant_feat, lq_feat) + + # ################## Generator #################### + x = quant_feat + fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list] + + for i, block in enumerate(self.generator.blocks): + x = block(x) + if i in fuse_list: # fuse after i-th block + f_size = str(x.shape[-1]) + if weight > 0: + x = self.fuse_convs_dict[f_size]( + enc_feat_dict[f_size].detach(), x, 
class FusedLeakyReLUFunctionBackward(Function):
    """Autograd function that computes the gradients of the fused op.

    Both passes delegate to ``fused_act_ext.fused_bias_act``.
    NOTE(review): the literal arguments ``3`` and ``1`` presumably select the
    activation type and the gradient mode of the extension kernel — confirm
    against the extension source.
    """

    @staticmethod
    def forward(ctx, grad_output, out, negative_slope, scale):
        """Return ``(grad_input, grad_bias)`` for the incoming ``grad_output``."""
        ctx.save_for_backward(out)
        ctx.negative_slope = negative_slope
        ctx.scale = scale

        # Zero-element placeholder passed in the bias slot.
        no_bias = grad_output.new_empty(0)
        grad_input = fused_act_ext.fused_bias_act(
            grad_output, no_bias, out, 3, 1, negative_slope, scale
        )

        # The bias gradient sums over every dimension except dim 1.
        reduce_dims = [0, *range(2, grad_input.ndim)]
        grad_bias = grad_input.sum(reduce_dims).detach()

        return grad_input, grad_bias

    @staticmethod
    def backward(ctx, gradgrad_input, gradgrad_bias):
        """Second-order pass; only ``grad_output`` receives a gradient."""
        (saved_out,) = ctx.saved_tensors
        gradgrad_out = fused_act_ext.fused_bias_act(
            gradgrad_input,
            gradgrad_bias,
            saved_out,
            3,
            1,
            ctx.negative_slope,
            ctx.scale,
        )

        return gradgrad_out, None, None, None
def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2**0.5):
    """Add a per-channel bias, apply leaky ReLU and multiply by ``scale``.

    Uses the compiled ``fused_act_ext`` kernel when it is available. When the
    module-level ``fused_act_ext`` is ``None`` (no extension loaded), falls
    back to plain PyTorch ops instead of failing with ``AttributeError``.

    Args:
        input (Tensor): Activations with channels on dim 1.
        bias (Tensor): 1D bias with one value per channel.
        negative_slope (float): Leaky ReLU negative slope. Default: 0.2.
        scale (float): Factor applied after the activation. Default: sqrt(2).

    Returns:
        Tensor: The activated tensor.
    """
    if fused_act_ext is None:
        # Pure-PyTorch fallback, following the CPU path of the upstream
        # StyleGAN2 op this file is adapted from.
        # NOTE(review): confirm numerical parity with the extension kernel.
        bias_shape = [1, -1] + [1] * (input.ndim - 2)
        activated = torch.nn.functional.leaky_relu(
            input + bias.view(bias_shape), negative_slope=negative_slope
        )
        return activated * scale
    return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)
Default: False. + """ + + def __init__( + self, + out_size, + num_style_feat=512, + num_mlp=8, + channel_multiplier=2, + lr_mlp=0.01, + narrow=1, + sft_half=False, + ): + super(StyleGAN2GeneratorBilinearSFT, self).__init__( + out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + lr_mlp=lr_mlp, + narrow=narrow, + ) + self.sft_half = sft_half + + def forward( + self, + styles, + conditions, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2GeneratorBilinearSFT. + Args: + styles (list[Tensor]): Sample codes of styles. + conditions (list[Tensor]): SFT conditions to generators. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. + truncation_latent (Tensor | None): The truncation latent tensor. Default: None. + inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. 
+ """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + + # the conditions may have fewer levels + if i < len(conditions): + # SFT part to combine the conditions + if self.sft_half: # only apply SFT to half of the channels + out_same, out_sft = torch.split(out, int(out.size(1) // 2), dim=1) + out_sft = out_sft * conditions[i - 1] + conditions[i] + out = torch.cat([out_same, out_sft], dim=1) + else: # apply SFT to all the channels + out = out * conditions[i - 1] + 
conditions[i] + + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) # feature back to the rgb space + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None + + +class GFPGANBilinear(nn.Module): + """The GFPGAN architecture: Unet + StyleGAN2 decoder with SFT. + It is the bilinear version and it does not use the complicated UpFirDnSmooth function that is not friendly for + deployment. It can be easily converted to the clean version: GFPGANv1Clean. + Ref: GFP-GAN: Towards Real-World Blind Face Restoration with Generative Facial Prior. + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None. + fix_decoder (bool): Whether to fix the decoder. Default: True. + num_mlp (int): Layer number of MLP style layers. Default: 8. + lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01. + input_is_latent (bool): Whether input is latent style. Default: False. + different_w (bool): Whether to use different latent w for different layers. Default: False. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. 
+ """ + + def __init__( + self, + out_size, + num_style_feat=512, + channel_multiplier=1, + decoder_load_path=None, + fix_decoder=True, + # for stylegan decoder + num_mlp=8, + lr_mlp=0.01, + input_is_latent=False, + different_w=False, + narrow=1, + sft_half=False, + ): + super(GFPGANBilinear, self).__init__() + self.input_is_latent = input_is_latent + self.different_w = different_w + self.num_style_feat = num_style_feat + self.min_size_restriction = 512 + + unet_narrow = narrow * 0.5 # by default, use a half of input channels + channels = { + "4": int(512 * unet_narrow), + "8": int(512 * unet_narrow), + "16": int(512 * unet_narrow), + "32": int(512 * unet_narrow), + "64": int(256 * channel_multiplier * unet_narrow), + "128": int(128 * channel_multiplier * unet_narrow), + "256": int(64 * channel_multiplier * unet_narrow), + "512": int(32 * channel_multiplier * unet_narrow), + "1024": int(16 * channel_multiplier * unet_narrow), + } + + self.log_size = int(math.log(out_size, 2)) + first_out_size = 2 ** (int(math.log(out_size, 2))) + + self.conv_body_first = ConvLayer( + 3, channels[f"{first_out_size}"], 1, bias=True, activate=True + ) + + # downsample + in_channels = channels[f"{first_out_size}"] + self.conv_body_down = nn.ModuleList() + for i in range(self.log_size, 2, -1): + out_channels = channels[f"{2**(i - 1)}"] + self.conv_body_down.append(ResBlock(in_channels, out_channels)) + in_channels = out_channels + + self.final_conv = ConvLayer( + in_channels, channels["4"], 3, bias=True, activate=True + ) + + # upsample + in_channels = channels["4"] + self.conv_body_up = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.conv_body_up.append(ResUpBlock(in_channels, out_channels)) + in_channels = out_channels + + # to RGB + self.toRGB = nn.ModuleList() + for i in range(3, self.log_size + 1): + self.toRGB.append( + EqualConv2d( + channels[f"{2**i}"], + 3, + 1, + stride=1, + padding=0, + bias=True, + bias_init_val=0, + ) + ) 
+ + if different_w: + linear_out_channel = (int(math.log(out_size, 2)) * 2 - 2) * num_style_feat + else: + linear_out_channel = num_style_feat + + self.final_linear = EqualLinear( + channels["4"] * 4 * 4, + linear_out_channel, + bias=True, + bias_init_val=0, + lr_mul=1, + activation=None, + ) + + # the decoder: stylegan2 generator with SFT modulations + self.stylegan_decoder = StyleGAN2GeneratorBilinearSFT( + out_size=out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + lr_mlp=lr_mlp, + narrow=narrow, + sft_half=sft_half, + ) + + # load pre-trained stylegan2 model if necessary + if decoder_load_path: + self.stylegan_decoder.load_state_dict( + torch.load( + decoder_load_path, map_location=lambda storage, loc: storage + )["params_ema"] + ) + # fix decoder without updating params + if fix_decoder: + for _, param in self.stylegan_decoder.named_parameters(): + param.requires_grad = False + + # for SFT modulations (scale and shift) + self.condition_scale = nn.ModuleList() + self.condition_shift = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + if sft_half: + sft_out_channels = out_channels + else: + sft_out_channels = out_channels * 2 + self.condition_scale.append( + nn.Sequential( + EqualConv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ScaledLeakyReLU(0.2), + EqualConv2d( + out_channels, + sft_out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=1, + ), + ) + ) + self.condition_shift.append( + nn.Sequential( + EqualConv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ScaledLeakyReLU(0.2), + EqualConv2d( + out_channels, + sft_out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ) + ) + + def forward(self, x, return_latents=False, return_rgb=True, randomize_noise=True): + """Forward function for GFPGANBilinear. 
+ Args: + x (Tensor): Input images. + return_latents (bool): Whether to return style latents. Default: False. + return_rgb (bool): Whether return intermediate rgb images. Default: True. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + """ + conditions = [] + unet_skips = [] + out_rgbs = [] + + # encoder + feat = self.conv_body_first(x) + for i in range(self.log_size - 2): + feat = self.conv_body_down[i](feat) + unet_skips.insert(0, feat) + + feat = self.final_conv(feat) + + # style code + style_code = self.final_linear(feat.view(feat.size(0), -1)) + if self.different_w: + style_code = style_code.view(style_code.size(0), -1, self.num_style_feat) + + # decode + for i in range(self.log_size - 2): + # add unet skip + feat = feat + unet_skips[i] + # ResUpLayer + feat = self.conv_body_up[i](feat) + # generate scale and shift for SFT layers + scale = self.condition_scale[i](feat) + conditions.append(scale.clone()) + shift = self.condition_shift[i](feat) + conditions.append(shift.clone()) + # generate rgb images + if return_rgb: + out_rgbs.append(self.toRGB[i](feat)) + + # decoder + image, _ = self.stylegan_decoder( + [style_code], + conditions, + return_latents=return_latents, + input_is_latent=self.input_is_latent, + randomize_noise=randomize_noise, + ) + + return image, out_rgbs diff --git a/ldm_patched/pfn/architecture/face/gfpganv1_arch.py b/ldm_patched/pfn/architecture/face/gfpganv1_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..72d72fc865ec35b2ccd23f13b3d8ef0be5dbaf7a --- /dev/null +++ b/ldm_patched/pfn/architecture/face/gfpganv1_arch.py @@ -0,0 +1,566 @@ +# pylint: skip-file +# type: ignore +import math +import random + +import torch +from torch import nn +from torch.nn import functional as F + +from .fused_act import FusedLeakyReLU +from .stylegan2_arch import ( + ConvLayer, + EqualConv2d, + EqualLinear, + ResBlock, + ScaledLeakyReLU, + StyleGAN2Generator, +) + + +class 
StyleGAN2GeneratorSFT(StyleGAN2Generator): + """StyleGAN2 Generator with SFT modulation (Spatial Feature Transform). + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross production will be + applied to extent 1D resample kernel to 2D resample kernel. Default: (1, 3, 3, 1). + lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. + """ + + def __init__( + self, + out_size, + num_style_feat=512, + num_mlp=8, + channel_multiplier=2, + resample_kernel=(1, 3, 3, 1), + lr_mlp=0.01, + narrow=1, + sft_half=False, + ): + super(StyleGAN2GeneratorSFT, self).__init__( + out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + resample_kernel=resample_kernel, + lr_mlp=lr_mlp, + narrow=narrow, + ) + self.sft_half = sft_half + + def forward( + self, + styles, + conditions, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2GeneratorSFT. + Args: + styles (list[Tensor]): Sample codes of styles. + conditions (list[Tensor]): SFT conditions to generators. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. + truncation_latent (Tensor | None): The truncation latent tensor. Default: None. 
+ inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. + """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + + # the conditions may have fewer levels + if i < len(conditions): + # SFT part to combine the conditions + if self.sft_half: # only apply SFT to half of the channels + out_same, out_sft = torch.split(out, int(out.size(1) // 2), dim=1) + out_sft = out_sft * 
class ConvUpLayer(nn.Module):
    """Convolutional upsampling layer: 2x bilinear upsample followed by a conv.

    The convolution weight is multiplied by ``1 / sqrt(in_channels *
    kernel_size**2)`` on every forward pass.

    Args:
        in_channels (int): Channel number of the input.
        out_channels (int): Channel number of the output.
        kernel_size (int): Size of the convolving kernel.
        stride (int): Stride of the convolution. Default: 1.
        padding (int): Zero-padding added to both sides of the input. Default: 0.
        bias (bool): Whether a learnable bias is used. When ``activate`` is
            also true the layer's own ``bias`` stays ``None`` and a
            ``FusedLeakyReLU`` (which has its own bias) is used. Default: True.
        bias_init_val (float): Initial value of the bias. Default: 0.
        activate (bool): Whether to apply an activation. Default: True.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        bias=True,
        bias_init_val=0,
        activate=True,
    ):
        super(ConvUpLayer, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        # Scales the convolution weights at run time; related to the common
        # initialisations.
        fan_in = in_channels * kernel_size**2
        self.scale = 1 / math.sqrt(fan_in)

        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
        self.weight = nn.Parameter(torch.randn(weight_shape))

        # The conv carries its own bias only when no activation follows.
        if bias and not activate:
            initial_bias = torch.full((out_channels,), float(bias_init_val))
            self.bias = nn.Parameter(initial_bias)
        else:
            self.register_parameter("bias", None)

        # Activation.
        if not activate:
            self.activation = None
        elif bias:
            self.activation = FusedLeakyReLU(out_channels)
        else:
            self.activation = ScaledLeakyReLU(0.2)

    def forward(self, x):
        """Upsample ``x`` by 2, convolve, then apply the optional activation."""
        upsampled = F.interpolate(
            x, scale_factor=2, mode="bilinear", align_corners=False
        )
        result = F.conv2d(
            upsampled,
            self.weight * self.scale,
            bias=self.bias,
            stride=self.stride,
            padding=self.padding,
        )
        if self.activation is None:
            return result
        return self.activation(result)
+ resample_kernel (list[int]): A list indicating the 1D resample kernel magnitude. A cross production will be + applied to extent 1D resample kernel to 2D resample kernel. Default: (1, 3, 3, 1). + decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None. + fix_decoder (bool): Whether to fix the decoder. Default: True. + num_mlp (int): Layer number of MLP style layers. Default: 8. + lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01. + input_is_latent (bool): Whether input is latent style. Default: False. + different_w (bool): Whether to use different latent w for different layers. Default: False. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. + """ + + def __init__( + self, + out_size, + num_style_feat=512, + channel_multiplier=1, + resample_kernel=(1, 3, 3, 1), + decoder_load_path=None, + fix_decoder=True, + # for stylegan decoder + num_mlp=8, + lr_mlp=0.01, + input_is_latent=False, + different_w=False, + narrow=1, + sft_half=False, + ): + super(GFPGANv1, self).__init__() + self.input_is_latent = input_is_latent + self.different_w = different_w + self.num_style_feat = num_style_feat + + unet_narrow = narrow * 0.5 # by default, use a half of input channels + channels = { + "4": int(512 * unet_narrow), + "8": int(512 * unet_narrow), + "16": int(512 * unet_narrow), + "32": int(512 * unet_narrow), + "64": int(256 * channel_multiplier * unet_narrow), + "128": int(128 * channel_multiplier * unet_narrow), + "256": int(64 * channel_multiplier * unet_narrow), + "512": int(32 * channel_multiplier * unet_narrow), + "1024": int(16 * channel_multiplier * unet_narrow), + } + + self.log_size = int(math.log(out_size, 2)) + first_out_size = 2 ** (int(math.log(out_size, 2))) + + self.conv_body_first = ConvLayer( + 3, channels[f"{first_out_size}"], 1, bias=True, activate=True + ) + + # downsample + in_channels = 
channels[f"{first_out_size}"] + self.conv_body_down = nn.ModuleList() + for i in range(self.log_size, 2, -1): + out_channels = channels[f"{2**(i - 1)}"] + self.conv_body_down.append( + ResBlock(in_channels, out_channels, resample_kernel) + ) + in_channels = out_channels + + self.final_conv = ConvLayer( + in_channels, channels["4"], 3, bias=True, activate=True + ) + + # upsample + in_channels = channels["4"] + self.conv_body_up = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.conv_body_up.append(ResUpBlock(in_channels, out_channels)) + in_channels = out_channels + + # to RGB + self.toRGB = nn.ModuleList() + for i in range(3, self.log_size + 1): + self.toRGB.append( + EqualConv2d( + channels[f"{2**i}"], + 3, + 1, + stride=1, + padding=0, + bias=True, + bias_init_val=0, + ) + ) + + if different_w: + linear_out_channel = (int(math.log(out_size, 2)) * 2 - 2) * num_style_feat + else: + linear_out_channel = num_style_feat + + self.final_linear = EqualLinear( + channels["4"] * 4 * 4, + linear_out_channel, + bias=True, + bias_init_val=0, + lr_mul=1, + activation=None, + ) + + # the decoder: stylegan2 generator with SFT modulations + self.stylegan_decoder = StyleGAN2GeneratorSFT( + out_size=out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + resample_kernel=resample_kernel, + lr_mlp=lr_mlp, + narrow=narrow, + sft_half=sft_half, + ) + + # load pre-trained stylegan2 model if necessary + if decoder_load_path: + self.stylegan_decoder.load_state_dict( + torch.load( + decoder_load_path, map_location=lambda storage, loc: storage + )["params_ema"] + ) + # fix decoder without updating params + if fix_decoder: + for _, param in self.stylegan_decoder.named_parameters(): + param.requires_grad = False + + # for SFT modulations (scale and shift) + self.condition_scale = nn.ModuleList() + self.condition_shift = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = 
channels[f"{2**i}"] + if sft_half: + sft_out_channels = out_channels + else: + sft_out_channels = out_channels * 2 + self.condition_scale.append( + nn.Sequential( + EqualConv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ScaledLeakyReLU(0.2), + EqualConv2d( + out_channels, + sft_out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=1, + ), + ) + ) + self.condition_shift.append( + nn.Sequential( + EqualConv2d( + out_channels, + out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ScaledLeakyReLU(0.2), + EqualConv2d( + out_channels, + sft_out_channels, + 3, + stride=1, + padding=1, + bias=True, + bias_init_val=0, + ), + ) + ) + + def forward( + self, x, return_latents=False, return_rgb=True, randomize_noise=True, **kwargs + ): + """Forward function for GFPGANv1. + Args: + x (Tensor): Input images. + return_latents (bool): Whether to return style latents. Default: False. + return_rgb (bool): Whether return intermediate rgb images. Default: True. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. 
class FacialComponentDiscriminator(nn.Module):
    """Discriminator for facial component crops (e.g. eyes, mouth) used in GFPGAN.

    The network is a fixed-size, VGG-style stack of ``ConvLayer`` modules:
    five feature convolutions followed by a single-channel output convolution.
    """

    def __init__(self):
        super().__init__()
        # Keyword arguments shared by all five feature convolutions.
        shared = dict(resample_kernel=(1, 3, 3, 1), bias=True, activate=True)
        self.conv1 = ConvLayer(3, 64, 3, downsample=False, **shared)
        self.conv2 = ConvLayer(64, 128, 3, downsample=True, **shared)
        self.conv3 = ConvLayer(128, 128, 3, downsample=False, **shared)
        self.conv4 = ConvLayer(128, 256, 3, downsample=True, **shared)
        self.conv5 = ConvLayer(256, 256, 3, downsample=False, **shared)
        self.final_conv = ConvLayer(256, 1, 3, bias=True, activate=False)

    def forward(self, x, return_feats=False, **kwargs):
        """Forward function for FacialComponentDiscriminator.

        Args:
            x (Tensor): Input images.
            return_feats (bool): Whether to return intermediate features.
                Default: False.

        Returns:
            tuple: ``(out, feats)`` where ``feats`` is a list holding clones of
            the activations after ``conv3`` and after ``conv5`` when
            ``return_feats`` is true, and ``None`` otherwise.
        """
        collected = []

        feat = self.conv3(self.conv2(self.conv1(x)))
        if return_feats:
            # Clone right away so later layers cannot alter the saved copy.
            collected.append(feat.clone())

        feat = self.conv5(self.conv4(feat))
        if return_feats:
            collected.append(feat.clone())

        out = self.final_conv(feat)
        return (out, collected) if return_feats else (out, None)
+ """ + + def __init__( + self, + out_size, + num_style_feat=512, + num_mlp=8, + channel_multiplier=2, + narrow=1, + sft_half=False, + ): + super(StyleGAN2GeneratorCSFT, self).__init__( + out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + narrow=narrow, + ) + self.sft_half = sft_half + + def forward( + self, + styles, + conditions, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2GeneratorCSFT. + Args: + styles (list[Tensor]): Sample codes of styles. + conditions (list[Tensor]): SFT conditions to generators. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. + truncation_latent (Tensor | None): The truncation latent tensor. Default: None. + inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. 
+ """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + + # the conditions may have fewer levels + if i < len(conditions): + # SFT part to combine the conditions + if self.sft_half: # only apply SFT to half of the channels + out_same, out_sft = torch.split(out, int(out.size(1) // 2), dim=1) + out_sft = out_sft * conditions[i - 1] + conditions[i] + out = torch.cat([out_same, out_sft], dim=1) + else: # apply SFT to all the channels + out = out * conditions[i - 1] + 
class ResBlock(nn.Module):
    """Residual block with bilinear upsampling/downsampling.

    Args:
        in_channels (int): Channel number of the input.
        out_channels (int): Channel number of the output.
        mode (str): Upsampling/downsampling mode. Options: down | up. Default: down.

    Raises:
        ValueError: If ``mode`` is neither ``"down"`` nor ``"up"``.
    """

    def __init__(self, in_channels, out_channels, mode="down"):
        super(ResBlock, self).__init__()

        # Reject unknown modes up front; otherwise ``scale_factor`` would be
        # left unset and the failure would only surface inside ``forward``.
        if mode == "down":
            scale_factor = 0.5
        elif mode == "up":
            scale_factor = 2
        else:
            raise ValueError(
                f"Unsupported mode {mode!r}; expected 'down' or 'up'."
            )

        self.conv1 = nn.Conv2d(in_channels, in_channels, 3, 1, 1)
        self.conv2 = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
        self.skip = nn.Conv2d(in_channels, out_channels, 1, bias=False)
        self.scale_factor = scale_factor

    def forward(self, x):
        """Apply conv -> resize -> conv on the main path and add the resized skip.

        Args:
            x (Tensor): Input feature map.

        Returns:
            Tensor: Feature map with ``out_channels`` channels, spatially
            rescaled by ``self.scale_factor``.
        """
        out = F.leaky_relu_(self.conv1(x), negative_slope=0.2)
        # upsample/downsample
        out = F.interpolate(
            out, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        out = F.leaky_relu_(self.conv2(out), negative_slope=0.2)
        # skip: resize the input the same way, then match channels with a 1x1 conv
        x = F.interpolate(
            x, scale_factor=self.scale_factor, mode="bilinear", align_corners=False
        )
        skip = self.skip(x)
        out = out + skip
        return out
Default: True. + num_mlp (int): Layer number of MLP style layers. Default: 8. + input_is_latent (bool): Whether input is latent style. Default: False. + different_w (bool): Whether to use different latent w for different layers. Default: False. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. + """ + + def __init__( + self, + state_dict, + ): + super(GFPGANv1Clean, self).__init__() + + out_size = 512 + num_style_feat = 512 + channel_multiplier = 2 + decoder_load_path = None + fix_decoder = False + num_mlp = 8 + input_is_latent = True + different_w = True + narrow = 1 + sft_half = True + + self.model_arch = "GFPGAN" + self.sub_type = "Face SR" + self.scale = 8 + self.in_nc = 3 + self.out_nc = 3 + self.state = state_dict + + self.supports_fp16 = False + self.supports_bf16 = True + self.min_size_restriction = 512 + + self.input_is_latent = input_is_latent + self.different_w = different_w + self.num_style_feat = num_style_feat + + unet_narrow = narrow * 0.5 # by default, use a half of input channels + channels = { + "4": int(512 * unet_narrow), + "8": int(512 * unet_narrow), + "16": int(512 * unet_narrow), + "32": int(512 * unet_narrow), + "64": int(256 * channel_multiplier * unet_narrow), + "128": int(128 * channel_multiplier * unet_narrow), + "256": int(64 * channel_multiplier * unet_narrow), + "512": int(32 * channel_multiplier * unet_narrow), + "1024": int(16 * channel_multiplier * unet_narrow), + } + + self.log_size = int(math.log(out_size, 2)) + first_out_size = 2 ** (int(math.log(out_size, 2))) + + self.conv_body_first = nn.Conv2d(3, channels[f"{first_out_size}"], 1) + + # downsample + in_channels = channels[f"{first_out_size}"] + self.conv_body_down = nn.ModuleList() + for i in range(self.log_size, 2, -1): + out_channels = channels[f"{2**(i - 1)}"] + self.conv_body_down.append(ResBlock(in_channels, out_channels, mode="down")) + in_channels = out_channels + + 
self.final_conv = nn.Conv2d(in_channels, channels["4"], 3, 1, 1) + + # upsample + in_channels = channels["4"] + self.conv_body_up = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.conv_body_up.append(ResBlock(in_channels, out_channels, mode="up")) + in_channels = out_channels + + # to RGB + self.toRGB = nn.ModuleList() + for i in range(3, self.log_size + 1): + self.toRGB.append(nn.Conv2d(channels[f"{2**i}"], 3, 1)) + + if different_w: + linear_out_channel = (int(math.log(out_size, 2)) * 2 - 2) * num_style_feat + else: + linear_out_channel = num_style_feat + + self.final_linear = nn.Linear(channels["4"] * 4 * 4, linear_out_channel) + + # the decoder: stylegan2 generator with SFT modulations + self.stylegan_decoder = StyleGAN2GeneratorCSFT( + out_size=out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + narrow=narrow, + sft_half=sft_half, + ) + + # load pre-trained stylegan2 model if necessary + if decoder_load_path: + self.stylegan_decoder.load_state_dict( + torch.load( + decoder_load_path, map_location=lambda storage, loc: storage + )["params_ema"] + ) + # fix decoder without updating params + if fix_decoder: + for _, param in self.stylegan_decoder.named_parameters(): + param.requires_grad = False + + # for SFT modulations (scale and shift) + self.condition_scale = nn.ModuleList() + self.condition_shift = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + if sft_half: + sft_out_channels = out_channels + else: + sft_out_channels = out_channels * 2 + self.condition_scale.append( + nn.Sequential( + nn.Conv2d(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(0.2, True), + nn.Conv2d(out_channels, sft_out_channels, 3, 1, 1), + ) + ) + self.condition_shift.append( + nn.Sequential( + nn.Conv2d(out_channels, out_channels, 3, 1, 1), + nn.LeakyReLU(0.2, True), + nn.Conv2d(out_channels, sft_out_channels, 3, 1, 1), + ) + ) + 
self.load_state_dict(state_dict) + + def forward( + self, x, return_latents=False, return_rgb=True, randomize_noise=True, **kwargs + ): + """Forward function for GFPGANv1Clean. + Args: + x (Tensor): Input images. + return_latents (bool): Whether to return style latents. Default: False. + return_rgb (bool): Whether return intermediate rgb images. Default: True. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + """ + conditions = [] + unet_skips = [] + out_rgbs = [] + + # encoder + feat = F.leaky_relu_(self.conv_body_first(x), negative_slope=0.2) + for i in range(self.log_size - 2): + feat = self.conv_body_down[i](feat) + unet_skips.insert(0, feat) + feat = F.leaky_relu_(self.final_conv(feat), negative_slope=0.2) + + # style code + style_code = self.final_linear(feat.view(feat.size(0), -1)) + if self.different_w: + style_code = style_code.view(style_code.size(0), -1, self.num_style_feat) + + # decode + for i in range(self.log_size - 2): + # add unet skip + feat = feat + unet_skips[i] + # ResUpLayer + feat = self.conv_body_up[i](feat) + # generate scale and shift for SFT layers + scale = self.condition_scale[i](feat) + conditions.append(scale.clone()) + shift = self.condition_shift[i](feat) + conditions.append(shift.clone()) + # generate rgb images + if return_rgb: + out_rgbs.append(self.toRGB[i](feat)) + + # decoder + image, _ = self.stylegan_decoder( + [style_code], + conditions, + return_latents=return_latents, + input_is_latent=self.input_is_latent, + randomize_noise=randomize_noise, + ) + + return image, out_rgbs diff --git a/ldm_patched/pfn/architecture/face/restoreformer_arch.py b/ldm_patched/pfn/architecture/face/restoreformer_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..4492260291d6d74b2c0d38130f7aa8b50ba2fc11 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/restoreformer_arch.py @@ -0,0 +1,776 @@ +# pylint: skip-file +# type: ignore +"""Modified from 
class VectorQuantizer(nn.Module):
    """
    see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
    ____________________________________________
    Discretization bottleneck part of the VQ-VAE.
    Inputs:
    - n_e : number of embeddings
    - e_dim : dimension of embedding
    - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
    _____________________________________________
    """

    def __init__(self, n_e, e_dim, beta):
        super(VectorQuantizer, self).__init__()
        self.n_e = n_e
        self.e_dim = e_dim
        self.beta = beta

        # Codebook: n_e vectors of size e_dim, initialised uniformly in
        # [-1/n_e, 1/n_e].
        self.embedding = nn.Embedding(self.n_e, self.e_dim)
        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)

    def forward(self, z):
        """Quantize ``z`` to its nearest codebook vectors.

        z (continuous) -> z_q (discrete)
        z.shape = (batch, channel, height, width)

        Quantization pipeline:
            1. get encoder input (B,C,H,W)
            2. flatten input to (B*H*W,C)
            3. pick the closest codebook vector for every flattened row

        Returns:
            tuple: ``(z_q, loss, (perplexity, min_encodings,
            min_encoding_indices, d))`` where ``z_q`` has the shape of the
            input, ``min_encodings`` is the (B*H*W, n_e) one-hot assignment,
            ``min_encoding_indices`` is (B*H*W, 1) and ``d`` holds the
            squared distances of every row to every codebook vector.
        """
        # reshape z -> (batch, height, width, channel) and flatten
        z = z.permute(0, 2, 3, 1).contiguous()
        z_flattened = z.view(-1, self.e_dim)

        # distances from z to embeddings e_j: (z - e)^2 = z^2 + e^2 - 2 e * z
        d = (
            torch.sum(z_flattened**2, dim=1, keepdim=True)
            + torch.sum(self.embedding.weight**2, dim=1)
            - 2 * torch.matmul(z_flattened, self.embedding.weight.t())
        )

        # find closest encodings (the minimum distance itself is not needed)
        _, min_encoding_indices = torch.min(d, dim=1)
        min_encoding_indices = min_encoding_indices.unsqueeze(1)

        # One-hot assignment matrix; it is also returned to the caller and
        # used for the perplexity below.
        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.n_e).to(z)
        min_encodings.scatter_(1, min_encoding_indices, 1)

        # get quantized latent vectors
        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)

        # compute loss for embedding
        loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
            (z_q - z.detach()) ** 2
        )

        # preserve gradients (straight-through estimator)
        z_q = z + (z_q - z).detach()

        # perplexity of the code usage within this batch
        e_mean = torch.mean(min_encodings, dim=0)
        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))

        # reshape back to match original input shape
        z_q = z_q.permute(0, 3, 1, 2).contiguous()

        return z_q, loss, (perplexity, min_encodings, min_encoding_indices, d)

    def get_codebook_entry(self, indices, shape):
        """Look up codebook vectors for ``indices``.

        Args:
            indices (Tensor): Integer codebook indices.
            shape (tuple | None): Target shape given as
                (batch, height, width, channel). When provided, the result is
                viewed to this shape and permuted to
                (batch, channel, height, width). When ``None``, the raw
                lookup of shape ``indices.shape + (e_dim,)`` is returned.

        Returns:
            Tensor: The selected codebook vectors.
        """
        # Direct embedding lookup: selects the same rows as multiplying a
        # one-hot matrix by the codebook, without materialising that matrix.
        z_q = self.embedding(indices)

        if shape is not None:
            z_q = z_q.view(shape)
            # reshape back to match original input shape
            z_q = z_q.permute(0, 3, 1, 2).contiguous()

        return z_q
class Downsample(nn.Module):
    """Halve the spatial resolution with a strided conv or with average pooling.

    Args:
        in_channels (int): Channel number of the input (and of the output).
        with_conv (bool): Use a learned 3x3 stride-2 convolution when true,
            2x2 average pooling otherwise.
    """

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # The convolution is built without padding; the asymmetric
            # padding it needs is applied by hand in ``forward``.
            self.conv = torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=3, stride=2, padding=0
            )

    def forward(self, x):
        """Return ``x`` downsampled by a factor of two."""
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)

        # Zero-pad one pixel on the right and on the bottom only.
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
class MultiHeadAttnBlock(nn.Module):
    """Multi-head attention over the spatial positions of a feature map.

    Keys and values always come from the normalized input ``x``; queries come
    from ``x`` as well unless a second feature map ``y`` is supplied.

    Args:
        in_channels (int): Channel number of the input; must be divisible by
            ``head_size``.
        head_size (int): Number of attention heads. Default: 1.
    """

    def __init__(self, in_channels, head_size=1):
        super().__init__()
        self.in_channels = in_channels
        self.head_size = head_size
        self.att_size = in_channels // head_size
        assert (
            in_channels % head_size == 0
        ), "The size of head should be divided by the number of channels."

        self.norm1 = Normalize(in_channels)
        self.norm2 = Normalize(in_channels)

        def pointwise():
            # 1x1 convolution that keeps the channel count.
            return torch.nn.Conv2d(
                in_channels, in_channels, kernel_size=1, stride=1, padding=0
            )

        self.q = pointwise()
        self.k = pointwise()
        self.v = pointwise()
        self.proj_out = pointwise()
        self.num = 0

    def forward(self, x, y=None):
        """Return ``x`` plus the projected attention output.

        Args:
            x (Tensor): Input of shape (b, c, h, w); source of keys and values.
            y (Tensor | None): Optional query source. When ``None`` the
                normalized ``x`` is used. NOTE(review): the reshapes below
                reuse the query's h*w for keys and values, so ``y`` presumably
                must have the same spatial size as ``x`` -- confirm.
        """
        normed = self.norm1(x)
        query_source = normed if y is None else self.norm2(y)

        q = self.q(query_source)
        k = self.k(normed)
        v = self.v(normed)

        # compute attention
        b, c, h, w = q.shape
        heads, dim, tokens = self.head_size, self.att_size, h * w

        # q, v: (b, head, tokens, att); k: (b, head, att, tokens)
        q = q.reshape(b, heads, dim, tokens).transpose(2, 3)
        v = v.reshape(b, heads, dim, tokens).transpose(2, 3)
        k = k.reshape(b, heads, dim, tokens)

        q.mul_(int(dim) ** (-0.5))
        weights = F.softmax(torch.matmul(q, k), dim=3)
        attended = weights.matmul(v)

        # (b, head, tokens, att) -> (b, tokens, head, att) -> (b, c, h, w)
        attended = attended.transpose(1, 2).contiguous()
        attended = attended.view(b, h, w, -1).permute(0, 3, 1, 2)

        return x + self.proj_out(attended)
class MultiHeadDecoder(nn.Module):
    """Decoder that maps a latent ``z`` back to an ``out_ch``-channel map.

    The network is: input conv -> optional middle stage (ResnetBlock,
    MultiHeadAttnBlock, ResnetBlock) -> one stage per entry of ``ch_mult``,
    walked from the last entry to the first, each with
    ``num_res_blocks + 1`` ResnetBlocks and an Upsample on every stage except
    stage 0 -> norm, swish, output conv.

    Args:
        ch: Base channel count; stage ``i`` uses ``ch * ch_mult[i]`` channels.
        out_ch: Channel count produced by the final convolution.
        ch_mult: Per-stage channel multipliers; its length sets the number of
            resolutions.
        num_res_blocks: Each stage builds ``num_res_blocks + 1`` ResnetBlocks.
        attn_resolutions: Resolutions at which an attention block is added
            after every ResnetBlock of the stage.
        dropout: Dropout value forwarded to every ResnetBlock.
        resamp_with_conv: Forwarded to ``Upsample`` as ``with_conv``.
        in_channels: Stored on the instance; not used to build layers here.
        resolution: Full output resolution; the latent resolution is
            ``resolution // 2 ** (len(ch_mult) - 1)``.
        z_channels: Channel count of the latent input.
        give_pre_end: If true, ``forward`` returns the features before the
            final norm / activation / conv.
        enable_mid: Whether to build and run the middle stage.
        head_size: Forwarded to every ``MultiHeadAttnBlock``.
        **ignorekwargs: Accepted and discarded.
    """

    def __init__(
        self,
        ch,
        out_ch,
        ch_mult=(1, 2, 4, 8),
        num_res_blocks=2,
        attn_resolutions=(16,),
        dropout=0.0,
        resamp_with_conv=True,
        in_channels=3,
        resolution=512,
        z_channels=256,
        give_pre_end=False,
        enable_mid=True,
        head_size=1,
        **ignorekwargs
    ):
        super().__init__()
        self.ch = ch
        # No timestep embedding: ResnetBlocks are built with temb_channels=0
        # and forward() passes temb=None.
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.enable_mid = enable_mid

        # compute in_ch_mult, block_in and curr_res at lowest res
        block_in = ch * ch_mult[self.num_resolutions - 1]
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.z_shape = (1, z_channels, curr_res, curr_res)
        # Prints the expected latent shape on every construction.
        print(
            "Working with z of shape {} = {} dimensions.".format(
                self.z_shape, np.prod(self.z_shape)
            )
        )

        # z to block_in
        self.conv_in = torch.nn.Conv2d(
            z_channels, block_in, kernel_size=3, stride=1, padding=1
        )

        # middle
        if self.enable_mid:
            self.mid = nn.Module()
            self.mid.block_1 = ResnetBlock(
                in_channels=block_in,
                out_channels=block_in,
                temb_channels=self.temb_ch,
                dropout=dropout,
            )
            self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size)
            self.mid.block_2 = ResnetBlock(
                in_channels=block_in,
                out_channels=block_in,
                temb_channels=self.temb_ch,
                dropout=dropout,
            )

        # upsampling: stages are created from the lowest resolution upward
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            block = nn.ModuleList()
            attn = nn.ModuleList()
            block_out = ch * ch_mult[i_level]
            for i_block in range(self.num_res_blocks + 1):
                block.append(
                    ResnetBlock(
                        in_channels=block_in,
                        out_channels=block_out,
                        temb_channels=self.temb_ch,
                        dropout=dropout,
                    )
                )
                block_in = block_out
                # One attention block per ResnetBlock at the listed resolutions.
                if curr_res in attn_resolutions:
                    attn.append(MultiHeadAttnBlock(block_in, head_size))
            up = nn.Module()
            up.block = block
            up.attn = attn
            if i_level != 0:
                up.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, up)  # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(
            block_in, out_ch, kernel_size=3, stride=1, padding=1
        )

    def forward(self, z):
        """Decode latent ``z``.

        Records ``z.shape`` in ``self.last_z_shape``. Returns the pre-output
        features when ``give_pre_end`` is set, otherwise the result of the
        final norm, swish and output convolution.
        """
        # assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        if self.enable_mid:
            h = self.mid.block_1(h, temb)
            h = self.mid.attn_1(h)
            h = self.mid.block_2(h, temb)

        # upsampling: walk the stages in the order they were built
        for i_level in reversed(range(self.num_resolutions)):
            for i_block in range(self.num_res_blocks + 1):
                h = self.up[i_level].block[i_block](h, temb)
                # Attention modules are indexed by the same block index.
                if len(self.up[i_level].attn) > 0:
                    h = self.up[i_level].attn[i_block](h)
            if i_level != 0:
                h = self.up[i_level].upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.norm_out(h)
        h = nonlinearity(h)
        h = self.conv_out(h)
        return h
MultiHeadDecoderTransformer(nn.Module): + def __init__( + self, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks=2, + attn_resolutions=(16,), + dropout=0.0, + resamp_with_conv=True, + in_channels=3, + resolution=512, + z_channels=256, + give_pre_end=False, + enable_mid=True, + head_size=1, + **ignorekwargs + ): + super().__init__() + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.enable_mid = enable_mid + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print( + "Working with z of shape {} = {} dimensions.".format( + self.z_shape, np.prod(self.z_shape) + ) + ) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1 + ) + + # middle + if self.enable_mid: + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(MultiHeadAttnBlock(block_in, head_size)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + 
up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, z, hs): + # assert z.shape[1:] == self.z_shape[1:] + # self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + if self.enable_mid: + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h, hs["mid_atten"]) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block]( + h, hs["block_" + str(i_level) + "_atten"] + ) + # hfeature = h.clone() + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class RestoreFormer(nn.Module): + def __init__( + self, + state_dict, + ): + super(RestoreFormer, self).__init__() + + n_embed = 1024 + embed_dim = 256 + ch = 64 + out_ch = 3 + ch_mult = (1, 2, 2, 4, 4, 8) + num_res_blocks = 2 + attn_resolutions = (16,) + dropout = 0.0 + in_channels = 3 + resolution = 512 + z_channels = 256 + double_z = False + enable_mid = True + fix_decoder = False + fix_codebook = True + fix_encoder = False + head_size = 8 + + self.model_arch = "RestoreFormer" + self.sub_type = "Face SR" + self.scale = 8 + self.in_nc = 3 + self.out_nc = out_ch + self.state = state_dict + + self.supports_fp16 = False + self.supports_bf16 = True + self.min_size_restriction = 16 + + self.encoder = MultiHeadEncoder( + ch=ch, + out_ch=out_ch, + ch_mult=ch_mult, + num_res_blocks=num_res_blocks, + attn_resolutions=attn_resolutions, + dropout=dropout, + in_channels=in_channels, + 
resolution=resolution, + z_channels=z_channels, + double_z=double_z, + enable_mid=enable_mid, + head_size=head_size, + ) + self.decoder = MultiHeadDecoderTransformer( + ch=ch, + out_ch=out_ch, + ch_mult=ch_mult, + num_res_blocks=num_res_blocks, + attn_resolutions=attn_resolutions, + dropout=dropout, + in_channels=in_channels, + resolution=resolution, + z_channels=z_channels, + enable_mid=enable_mid, + head_size=head_size, + ) + + self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25) + + self.quant_conv = torch.nn.Conv2d(z_channels, embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, z_channels, 1) + + if fix_decoder: + for _, param in self.decoder.named_parameters(): + param.requires_grad = False + for _, param in self.post_quant_conv.named_parameters(): + param.requires_grad = False + for _, param in self.quantize.named_parameters(): + param.requires_grad = False + elif fix_codebook: + for _, param in self.quantize.named_parameters(): + param.requires_grad = False + + if fix_encoder: + for _, param in self.encoder.named_parameters(): + param.requires_grad = False + + self.load_state_dict(state_dict) + + def encode(self, x): + hs = self.encoder(x) + h = self.quant_conv(hs["out"]) + quant, emb_loss, info = self.quantize(h) + return quant, emb_loss, info, hs + + def decode(self, quant, hs): + quant = self.post_quant_conv(quant) + dec = self.decoder(quant, hs) + + return dec + + def forward(self, input, **kwargs): + quant, diff, info, hs = self.encode(input) + dec = self.decode(quant, hs) + + return dec, None diff --git a/ldm_patched/pfn/architecture/face/stylegan2_arch.py b/ldm_patched/pfn/architecture/face/stylegan2_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..1eb0e9f15f706e2b9759bde4d0244d424c3ae76f --- /dev/null +++ b/ldm_patched/pfn/architecture/face/stylegan2_arch.py @@ -0,0 +1,865 @@ +# pylint: skip-file +# type: ignore +import math +import random + +import torch +from torch import nn +from torch.nn import 
functional as F + +from .fused_act import FusedLeakyReLU, fused_leaky_relu +from .upfirdn2d import upfirdn2d + + +class NormStyleCode(nn.Module): + def forward(self, x): + """Normalize the style codes. + + Args: + x (Tensor): Style codes with shape (b, c). + + Returns: + Tensor: Normalized tensor. + """ + return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8) + + +def make_resample_kernel(k): + """Make resampling kernel for UpFirDn. + + Args: + k (list[int]): A list indicating the 1D resample kernel magnitude. + + Returns: + Tensor: 2D resampled kernel. + """ + k = torch.tensor(k, dtype=torch.float32) + if k.ndim == 1: + k = k[None, :] * k[:, None] # to 2D kernel, outer product + # normalize + k /= k.sum() + return k + + +class UpFirDnUpsample(nn.Module): + """Upsample, FIR filter, and downsample (upsampole version). + + References: + 1. https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.upfirdn.html # noqa: E501 + 2. http://www.ece.northwestern.edu/local-apps/matlabhelp/toolbox/signal/upfirdn.html # noqa: E501 + + Args: + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. + factor (int): Upsampling scale factor. Default: 2. + """ + + def __init__(self, resample_kernel, factor=2): + super(UpFirDnUpsample, self).__init__() + self.kernel = make_resample_kernel(resample_kernel) * (factor**2) + self.factor = factor + + pad = self.kernel.shape[0] - factor + self.pad = ((pad + 1) // 2 + factor - 1, pad // 2) + + def forward(self, x): + out = upfirdn2d(x, self.kernel.type_as(x), up=self.factor, down=1, pad=self.pad) + return out + + def __repr__(self): + return f"{self.__class__.__name__}(factor={self.factor})" + + +class UpFirDnDownsample(nn.Module): + """Upsample, FIR filter, and downsample (downsampole version). + + Args: + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. + factor (int): Downsampling scale factor. Default: 2. 
+ """ + + def __init__(self, resample_kernel, factor=2): + super(UpFirDnDownsample, self).__init__() + self.kernel = make_resample_kernel(resample_kernel) + self.factor = factor + + pad = self.kernel.shape[0] - factor + self.pad = ((pad + 1) // 2, pad // 2) + + def forward(self, x): + out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=self.factor, pad=self.pad) + return out + + def __repr__(self): + return f"{self.__class__.__name__}(factor={self.factor})" + + +class UpFirDnSmooth(nn.Module): + """Upsample, FIR filter, and downsample (smooth version). + + Args: + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. + upsample_factor (int): Upsampling scale factor. Default: 1. + downsample_factor (int): Downsampling scale factor. Default: 1. + kernel_size (int): Kernel size: Default: 1. + """ + + def __init__( + self, resample_kernel, upsample_factor=1, downsample_factor=1, kernel_size=1 + ): + super(UpFirDnSmooth, self).__init__() + self.upsample_factor = upsample_factor + self.downsample_factor = downsample_factor + self.kernel = make_resample_kernel(resample_kernel) + if upsample_factor > 1: + self.kernel = self.kernel * (upsample_factor**2) + + if upsample_factor > 1: + pad = (self.kernel.shape[0] - upsample_factor) - (kernel_size - 1) + self.pad = ((pad + 1) // 2 + upsample_factor - 1, pad // 2 + 1) + elif downsample_factor > 1: + pad = (self.kernel.shape[0] - downsample_factor) + (kernel_size - 1) + self.pad = ((pad + 1) // 2, pad // 2) + else: + raise NotImplementedError + + def forward(self, x): + out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=1, pad=self.pad) + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(upsample_factor={self.upsample_factor}" + f", downsample_factor={self.downsample_factor})" + ) + + +class EqualLinear(nn.Module): + """Equalized Linear as StyleGAN2. + + Args: + in_channels (int): Size of each sample. + out_channels (int): Size of each output sample. 
+ bias (bool): If set to ``False``, the layer will not learn an additive + bias. Default: ``True``. + bias_init_val (float): Bias initialized value. Default: 0. + lr_mul (float): Learning rate multiplier. Default: 1. + activation (None | str): The activation after ``linear`` operation. + Supported: 'fused_lrelu', None. Default: None. + """ + + def __init__( + self, + in_channels, + out_channels, + bias=True, + bias_init_val=0, + lr_mul=1, + activation=None, + ): + super(EqualLinear, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.lr_mul = lr_mul + self.activation = activation + if self.activation not in ["fused_lrelu", None]: + raise ValueError( + f"Wrong activation value in EqualLinear: {activation}" + "Supported ones are: ['fused_lrelu', None]." + ) + self.scale = (1 / math.sqrt(in_channels)) * lr_mul + + self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul)) + if bias: + self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val)) + else: + self.register_parameter("bias", None) + + def forward(self, x): + if self.bias is None: + bias = None + else: + bias = self.bias * self.lr_mul + if self.activation == "fused_lrelu": + out = F.linear(x, self.weight * self.scale) + out = fused_leaky_relu(out, bias) + else: + out = F.linear(x, self.weight * self.scale, bias=bias) + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, bias={self.bias is not None})" + ) + + +class ModulatedConv2d(nn.Module): + """Modulated Conv2d used in StyleGAN2. + + There is no bias in ModulatedConv2d. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether to demodulate in the conv layer. + Default: True. 
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None. + Default: None. + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. Default: (1, 3, 3, 1). + eps (float): A value added to the denominator for numerical stability. + Default: 1e-8. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + resample_kernel=(1, 3, 3, 1), + eps=1e-8, + ): + super(ModulatedConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.demodulate = demodulate + self.sample_mode = sample_mode + self.eps = eps + + if self.sample_mode == "upsample": + self.smooth = UpFirDnSmooth( + resample_kernel, + upsample_factor=2, + downsample_factor=1, + kernel_size=kernel_size, + ) + elif self.sample_mode == "downsample": + self.smooth = UpFirDnSmooth( + resample_kernel, + upsample_factor=1, + downsample_factor=2, + kernel_size=kernel_size, + ) + elif self.sample_mode is None: + pass + else: + raise ValueError( + f"Wrong sample mode {self.sample_mode}, " + "supported ones are ['upsample', 'downsample', None]." + ) + + self.scale = 1 / math.sqrt(in_channels * kernel_size**2) + # modulation inside each modulated conv + self.modulation = EqualLinear( + num_style_feat, + in_channels, + bias=True, + bias_init_val=1, + lr_mul=1, + activation=None, + ) + + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, kernel_size, kernel_size) + ) + self.padding = kernel_size // 2 + + def forward(self, x, style): + """Forward function. + + Args: + x (Tensor): Tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + + Returns: + Tensor: Modulated tensor after convolution. 
+ """ + b, c, h, w = x.shape # c = c_in + # weight modulation + style = self.modulation(style).view(b, 1, c, 1, 1) + # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1) + weight = self.scale * self.weight * style # (b, c_out, c_in, k, k) + + if self.demodulate: + demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps) + weight = weight * demod.view(b, self.out_channels, 1, 1, 1) + + weight = weight.view( + b * self.out_channels, c, self.kernel_size, self.kernel_size + ) + + if self.sample_mode == "upsample": + x = x.view(1, b * c, h, w) + weight = weight.view( + b, self.out_channels, c, self.kernel_size, self.kernel_size + ) + weight = weight.transpose(1, 2).reshape( + b * c, self.out_channels, self.kernel_size, self.kernel_size + ) + out = F.conv_transpose2d(x, weight, padding=0, stride=2, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + out = self.smooth(out) + elif self.sample_mode == "downsample": + x = self.smooth(x) + x = x.view(1, b * c, *x.shape[2:4]) + out = F.conv2d(x, weight, padding=0, stride=2, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + else: + x = x.view(1, b * c, h, w) + # weight: (b*c_out, c_in, k, k), groups=b + out = F.conv2d(x, weight, padding=self.padding, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}, " + f"demodulate={self.demodulate}, sample_mode={self.sample_mode})" + ) + + +class StyleConv(nn.Module): + """Style conv. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether demodulate in the conv layer. Default: True. 
+ sample_mode (str | None): Indicating 'upsample', 'downsample' or None. + Default: None. + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. Default: (1, 3, 3, 1). + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + resample_kernel=(1, 3, 3, 1), + ): + super(StyleConv, self).__init__() + self.modulated_conv = ModulatedConv2d( + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=demodulate, + sample_mode=sample_mode, + resample_kernel=resample_kernel, + ) + self.weight = nn.Parameter(torch.zeros(1)) # for noise injection + self.activate = FusedLeakyReLU(out_channels) + + def forward(self, x, style, noise=None): + # modulate + out = self.modulated_conv(x, style) + # noise injection + if noise is None: + b, _, h, w = out.shape + noise = out.new_empty(b, 1, h, w).normal_() + out = out + self.weight * noise + # activation (with bias) + out = self.activate(out) + return out + + +class ToRGB(nn.Module): + """To RGB from features. + + Args: + in_channels (int): Channel number of input. + num_style_feat (int): Channel number of style features. + upsample (bool): Whether to upsample. Default: True. + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. Default: (1, 3, 3, 1). + """ + + def __init__( + self, in_channels, num_style_feat, upsample=True, resample_kernel=(1, 3, 3, 1) + ): + super(ToRGB, self).__init__() + if upsample: + self.upsample = UpFirDnUpsample(resample_kernel, factor=2) + else: + self.upsample = None + self.modulated_conv = ModulatedConv2d( + in_channels, + 3, + kernel_size=1, + num_style_feat=num_style_feat, + demodulate=False, + sample_mode=None, + ) + self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1)) + + def forward(self, x, style, skip=None): + """Forward function. + + Args: + x (Tensor): Feature tensor with shape (b, c, h, w). 
+ style (Tensor): Tensor with shape (b, num_style_feat). + skip (Tensor): Base/skip tensor. Default: None. + + Returns: + Tensor: RGB images. + """ + out = self.modulated_conv(x, style) + out = out + self.bias + if skip is not None: + if self.upsample: + skip = self.upsample(skip) + out = out + skip + return out + + +class ConstantInput(nn.Module): + """Constant input. + + Args: + num_channel (int): Channel number of constant input. + size (int): Spatial size of constant input. + """ + + def __init__(self, num_channel, size): + super(ConstantInput, self).__init__() + self.weight = nn.Parameter(torch.randn(1, num_channel, size, size)) + + def forward(self, batch): + out = self.weight.repeat(batch, 1, 1, 1) + return out + + +class StyleGAN2Generator(nn.Module): + """StyleGAN2 Generator. + + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of + StyleGAN2. Default: 2. + resample_kernel (list[int]): A list indicating the 1D resample kernel + magnitude. A cross production will be applied to extent 1D resample + kernel to 2D resample kernel. Default: (1, 3, 3, 1). + lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01. + narrow (float): Narrow ratio for channels. Default: 1.0. 
+ """ + + def __init__( + self, + out_size, + num_style_feat=512, + num_mlp=8, + channel_multiplier=2, + resample_kernel=(1, 3, 3, 1), + lr_mlp=0.01, + narrow=1, + ): + super(StyleGAN2Generator, self).__init__() + # Style MLP layers + self.num_style_feat = num_style_feat + style_mlp_layers = [NormStyleCode()] + for i in range(num_mlp): + style_mlp_layers.append( + EqualLinear( + num_style_feat, + num_style_feat, + bias=True, + bias_init_val=0, + lr_mul=lr_mlp, + activation="fused_lrelu", + ) + ) + self.style_mlp = nn.Sequential(*style_mlp_layers) + + channels = { + "4": int(512 * narrow), + "8": int(512 * narrow), + "16": int(512 * narrow), + "32": int(512 * narrow), + "64": int(256 * channel_multiplier * narrow), + "128": int(128 * channel_multiplier * narrow), + "256": int(64 * channel_multiplier * narrow), + "512": int(32 * channel_multiplier * narrow), + "1024": int(16 * channel_multiplier * narrow), + } + self.channels = channels + + self.constant_input = ConstantInput(channels["4"], size=4) + self.style_conv1 = StyleConv( + channels["4"], + channels["4"], + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + resample_kernel=resample_kernel, + ) + self.to_rgb1 = ToRGB( + channels["4"], + num_style_feat, + upsample=False, + resample_kernel=resample_kernel, + ) + + self.log_size = int(math.log(out_size, 2)) + self.num_layers = (self.log_size - 2) * 2 + 1 + self.num_latent = self.log_size * 2 - 2 + + self.style_convs = nn.ModuleList() + self.to_rgbs = nn.ModuleList() + self.noises = nn.Module() + + in_channels = channels["4"] + # noise + for layer_idx in range(self.num_layers): + resolution = 2 ** ((layer_idx + 5) // 2) + shape = [1, 1, resolution, resolution] + self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape)) + # style convs and to_rgbs + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.style_convs.append( + StyleConv( + in_channels, + out_channels, + kernel_size=3, + 
num_style_feat=num_style_feat, + demodulate=True, + sample_mode="upsample", + resample_kernel=resample_kernel, + ) + ) + self.style_convs.append( + StyleConv( + out_channels, + out_channels, + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + resample_kernel=resample_kernel, + ) + ) + self.to_rgbs.append( + ToRGB( + out_channels, + num_style_feat, + upsample=True, + resample_kernel=resample_kernel, + ) + ) + in_channels = out_channels + + def make_noise(self): + """Make noise for noise injection.""" + device = self.constant_input.weight.device + noises = [torch.randn(1, 1, 4, 4, device=device)] + + for i in range(3, self.log_size + 1): + for _ in range(2): + noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) + + return noises + + def get_latent(self, x): + return self.style_mlp(x) + + def mean_latent(self, num_latent): + latent_in = torch.randn( + num_latent, self.num_style_feat, device=self.constant_input.weight.device + ) + latent = self.style_mlp(latent_in).mean(0, keepdim=True) + return latent + + def forward( + self, + styles, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2Generator. + + Args: + styles (list[Tensor]): Sample codes of styles. + input_is_latent (bool): Whether input is latent style. + Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is + False. Default: True. + truncation (float): TODO. Default: 1. + truncation_latent (Tensor | None): TODO. Default: None. + inject_index (int | None): The injection index for mixing noise. + Default: None. + return_latents (bool): Whether to return style latents. + Default: False. 
+ """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latent with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None + + +class ScaledLeakyReLU(nn.Module): + """Scaled LeakyReLU. + + Args: + negative_slope (float): Negative slope. Default: 0.2. 
+ """ + + def __init__(self, negative_slope=0.2): + super(ScaledLeakyReLU, self).__init__() + self.negative_slope = negative_slope + + def forward(self, x): + out = F.leaky_relu(x, negative_slope=self.negative_slope) + return out * math.sqrt(2) + + +class EqualConv2d(nn.Module): + """Equalized Linear as StyleGAN2. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + stride (int): Stride of the convolution. Default: 1 + padding (int): Zero-padding added to both sides of the input. + Default: 0. + bias (bool): If ``True``, adds a learnable bias to the output. + Default: ``True``. + bias_init_val (float): Bias initialized value. Default: 0. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + bias=True, + bias_init_val=0, + ): + super(EqualConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.scale = 1 / math.sqrt(in_channels * kernel_size**2) + + self.weight = nn.Parameter( + torch.randn(out_channels, in_channels, kernel_size, kernel_size) + ) + if bias: + self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val)) + else: + self.register_parameter("bias", None) + + def forward(self, x): + out = F.conv2d( + x, + self.weight * self.scale, + bias=self.bias, + stride=self.stride, + padding=self.padding, + ) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}," + f" stride={self.stride}, padding={self.padding}, " + f"bias={self.bias is not None})" + ) + + +class ConvLayer(nn.Sequential): + """Conv Layer used in StyleGAN2 Discriminator. + + Args: + in_channels (int): Channel number of the input. 
+ out_channels (int): Channel number of the output. + kernel_size (int): Kernel size. + downsample (bool): Whether downsample by a factor of 2. + Default: False. + resample_kernel (list[int]): A list indicating the 1D resample + kernel magnitude. A cross production will be applied to + extent 1D resample kernel to 2D resample kernel. + Default: (1, 3, 3, 1). + bias (bool): Whether with bias. Default: True. + activate (bool): Whether use activateion. Default: True. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + downsample=False, + resample_kernel=(1, 3, 3, 1), + bias=True, + activate=True, + ): + layers = [] + # downsample + if downsample: + layers.append( + UpFirDnSmooth( + resample_kernel, + upsample_factor=1, + downsample_factor=2, + kernel_size=kernel_size, + ) + ) + stride = 2 + self.padding = 0 + else: + stride = 1 + self.padding = kernel_size // 2 + # conv + layers.append( + EqualConv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=self.padding, + bias=bias and not activate, + ) + ) + # activation + if activate: + if bias: + layers.append(FusedLeakyReLU(out_channels)) + else: + layers.append(ScaledLeakyReLU(0.2)) + + super(ConvLayer, self).__init__(*layers) + + +class ResBlock(nn.Module): + """Residual block used in StyleGAN2 Discriminator. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + resample_kernel (list[int]): A list indicating the 1D resample + kernel magnitude. A cross production will be applied to + extent 1D resample kernel to 2D resample kernel. + Default: (1, 3, 3, 1). 
+ """ + + def __init__(self, in_channels, out_channels, resample_kernel=(1, 3, 3, 1)): + super(ResBlock, self).__init__() + + self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True) + self.conv2 = ConvLayer( + in_channels, + out_channels, + 3, + downsample=True, + resample_kernel=resample_kernel, + bias=True, + activate=True, + ) + self.skip = ConvLayer( + in_channels, + out_channels, + 1, + downsample=True, + resample_kernel=resample_kernel, + bias=False, + activate=False, + ) + + def forward(self, x): + out = self.conv1(x) + out = self.conv2(out) + skip = self.skip(x) + out = (out + skip) / math.sqrt(2) + return out diff --git a/ldm_patched/pfn/architecture/face/stylegan2_bilinear_arch.py b/ldm_patched/pfn/architecture/face/stylegan2_bilinear_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..601f8cc4b33bdbb371d710a2bb0656e8ce102e26 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/stylegan2_bilinear_arch.py @@ -0,0 +1,709 @@ +# pylint: skip-file +# type: ignore +import math +import random + +import torch +from torch import nn +from torch.nn import functional as F + +from .fused_act import FusedLeakyReLU, fused_leaky_relu + + +class NormStyleCode(nn.Module): + def forward(self, x): + """Normalize the style codes. + Args: + x (Tensor): Style codes with shape (b, c). + Returns: + Tensor: Normalized tensor. + """ + return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8) + + +class EqualLinear(nn.Module): + """Equalized Linear as StyleGAN2. + Args: + in_channels (int): Size of each sample. + out_channels (int): Size of each output sample. + bias (bool): If set to ``False``, the layer will not learn an additive + bias. Default: ``True``. + bias_init_val (float): Bias initialized value. Default: 0. + lr_mul (float): Learning rate multiplier. Default: 1. + activation (None | str): The activation after ``linear`` operation. + Supported: 'fused_lrelu', None. Default: None. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + bias=True, + bias_init_val=0, + lr_mul=1, + activation=None, + ): + super(EqualLinear, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.lr_mul = lr_mul + self.activation = activation + if self.activation not in ["fused_lrelu", None]: + raise ValueError( + f"Wrong activation value in EqualLinear: {activation}" + "Supported ones are: ['fused_lrelu', None]." + ) + self.scale = (1 / math.sqrt(in_channels)) * lr_mul + + self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul)) + if bias: + self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val)) + else: + self.register_parameter("bias", None) + + def forward(self, x): + if self.bias is None: + bias = None + else: + bias = self.bias * self.lr_mul + if self.activation == "fused_lrelu": + out = F.linear(x, self.weight * self.scale) + out = fused_leaky_relu(out, bias) + else: + out = F.linear(x, self.weight * self.scale, bias=bias) + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, bias={self.bias is not None})" + ) + + +class ModulatedConv2d(nn.Module): + """Modulated Conv2d used in StyleGAN2. + There is no bias in ModulatedConv2d. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether to demodulate in the conv layer. + Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. + Default: None. + eps (float): A value added to the denominator for numerical stability. + Default: 1e-8. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + eps=1e-8, + interpolation_mode="bilinear", + ): + super(ModulatedConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.demodulate = demodulate + self.sample_mode = sample_mode + self.eps = eps + self.interpolation_mode = interpolation_mode + if self.interpolation_mode == "nearest": + self.align_corners = None + else: + self.align_corners = False + + self.scale = 1 / math.sqrt(in_channels * kernel_size**2) + # modulation inside each modulated conv + self.modulation = EqualLinear( + num_style_feat, + in_channels, + bias=True, + bias_init_val=1, + lr_mul=1, + activation=None, + ) + + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, kernel_size, kernel_size) + ) + self.padding = kernel_size // 2 + + def forward(self, x, style): + """Forward function. + Args: + x (Tensor): Tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + Returns: + Tensor: Modulated tensor after convolution. 
+ """ + b, c, h, w = x.shape # c = c_in + # weight modulation + style = self.modulation(style).view(b, 1, c, 1, 1) + # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1) + weight = self.scale * self.weight * style # (b, c_out, c_in, k, k) + + if self.demodulate: + demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps) + weight = weight * demod.view(b, self.out_channels, 1, 1, 1) + + weight = weight.view( + b * self.out_channels, c, self.kernel_size, self.kernel_size + ) + + if self.sample_mode == "upsample": + x = F.interpolate( + x, + scale_factor=2, + mode=self.interpolation_mode, + align_corners=self.align_corners, + ) + elif self.sample_mode == "downsample": + x = F.interpolate( + x, + scale_factor=0.5, + mode=self.interpolation_mode, + align_corners=self.align_corners, + ) + + b, c, h, w = x.shape + x = x.view(1, b * c, h, w) + # weight: (b*c_out, c_in, k, k), groups=b + out = F.conv2d(x, weight, padding=self.padding, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}, " + f"demodulate={self.demodulate}, sample_mode={self.sample_mode})" + ) + + +class StyleConv(nn.Module): + """Style conv. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether demodulate in the conv layer. Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. + Default: None. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + interpolation_mode="bilinear", + ): + super(StyleConv, self).__init__() + self.modulated_conv = ModulatedConv2d( + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=demodulate, + sample_mode=sample_mode, + interpolation_mode=interpolation_mode, + ) + self.weight = nn.Parameter(torch.zeros(1)) # for noise injection + self.activate = FusedLeakyReLU(out_channels) + + def forward(self, x, style, noise=None): + # modulate + out = self.modulated_conv(x, style) + # noise injection + if noise is None: + b, _, h, w = out.shape + noise = out.new_empty(b, 1, h, w).normal_() + out = out + self.weight * noise + # activation (with bias) + out = self.activate(out) + return out + + +class ToRGB(nn.Module): + """To RGB from features. + Args: + in_channels (int): Channel number of input. + num_style_feat (int): Channel number of style features. + upsample (bool): Whether to upsample. Default: True. + """ + + def __init__( + self, in_channels, num_style_feat, upsample=True, interpolation_mode="bilinear" + ): + super(ToRGB, self).__init__() + self.upsample = upsample + self.interpolation_mode = interpolation_mode + if self.interpolation_mode == "nearest": + self.align_corners = None + else: + self.align_corners = False + self.modulated_conv = ModulatedConv2d( + in_channels, + 3, + kernel_size=1, + num_style_feat=num_style_feat, + demodulate=False, + sample_mode=None, + interpolation_mode=interpolation_mode, + ) + self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1)) + + def forward(self, x, style, skip=None): + """Forward function. + Args: + x (Tensor): Feature tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + skip (Tensor): Base/skip tensor. Default: None. + Returns: + Tensor: RGB images. 
+ """ + out = self.modulated_conv(x, style) + out = out + self.bias + if skip is not None: + if self.upsample: + skip = F.interpolate( + skip, + scale_factor=2, + mode=self.interpolation_mode, + align_corners=self.align_corners, + ) + out = out + skip + return out + + +class ConstantInput(nn.Module): + """Constant input. + Args: + num_channel (int): Channel number of constant input. + size (int): Spatial size of constant input. + """ + + def __init__(self, num_channel, size): + super(ConstantInput, self).__init__() + self.weight = nn.Parameter(torch.randn(1, num_channel, size, size)) + + def forward(self, batch): + out = self.weight.repeat(batch, 1, 1, 1) + return out + + +class StyleGAN2GeneratorBilinear(nn.Module): + """StyleGAN2 Generator. + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of + StyleGAN2. Default: 2. + lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01. + narrow (float): Narrow ratio for channels. Default: 1.0. 
+ """ + + def __init__( + self, + out_size, + num_style_feat=512, + num_mlp=8, + channel_multiplier=2, + lr_mlp=0.01, + narrow=1, + interpolation_mode="bilinear", + ): + super(StyleGAN2GeneratorBilinear, self).__init__() + # Style MLP layers + self.num_style_feat = num_style_feat + style_mlp_layers = [NormStyleCode()] + for i in range(num_mlp): + style_mlp_layers.append( + EqualLinear( + num_style_feat, + num_style_feat, + bias=True, + bias_init_val=0, + lr_mul=lr_mlp, + activation="fused_lrelu", + ) + ) + self.style_mlp = nn.Sequential(*style_mlp_layers) + + channels = { + "4": int(512 * narrow), + "8": int(512 * narrow), + "16": int(512 * narrow), + "32": int(512 * narrow), + "64": int(256 * channel_multiplier * narrow), + "128": int(128 * channel_multiplier * narrow), + "256": int(64 * channel_multiplier * narrow), + "512": int(32 * channel_multiplier * narrow), + "1024": int(16 * channel_multiplier * narrow), + } + self.channels = channels + + self.constant_input = ConstantInput(channels["4"], size=4) + self.style_conv1 = StyleConv( + channels["4"], + channels["4"], + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + interpolation_mode=interpolation_mode, + ) + self.to_rgb1 = ToRGB( + channels["4"], + num_style_feat, + upsample=False, + interpolation_mode=interpolation_mode, + ) + + self.log_size = int(math.log(out_size, 2)) + self.num_layers = (self.log_size - 2) * 2 + 1 + self.num_latent = self.log_size * 2 - 2 + + self.style_convs = nn.ModuleList() + self.to_rgbs = nn.ModuleList() + self.noises = nn.Module() + + in_channels = channels["4"] + # noise + for layer_idx in range(self.num_layers): + resolution = 2 ** ((layer_idx + 5) // 2) + shape = [1, 1, resolution, resolution] + self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape)) + # style convs and to_rgbs + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.style_convs.append( + StyleConv( + in_channels, + out_channels, + 
kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode="upsample", + interpolation_mode=interpolation_mode, + ) + ) + self.style_convs.append( + StyleConv( + out_channels, + out_channels, + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + interpolation_mode=interpolation_mode, + ) + ) + self.to_rgbs.append( + ToRGB( + out_channels, + num_style_feat, + upsample=True, + interpolation_mode=interpolation_mode, + ) + ) + in_channels = out_channels + + def make_noise(self): + """Make noise for noise injection.""" + device = self.constant_input.weight.device + noises = [torch.randn(1, 1, 4, 4, device=device)] + + for i in range(3, self.log_size + 1): + for _ in range(2): + noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) + + return noises + + def get_latent(self, x): + return self.style_mlp(x) + + def mean_latent(self, num_latent): + latent_in = torch.randn( + num_latent, self.num_style_feat, device=self.constant_input.weight.device + ) + latent = self.style_mlp(latent_in).mean(0, keepdim=True) + return latent + + def forward( + self, + styles, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2Generator. + Args: + styles (list[Tensor]): Sample codes of styles. + input_is_latent (bool): Whether input is latent style. + Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is + False. Default: True. + truncation (float): TODO. Default: 1. + truncation_latent (Tensor | None): TODO. Default: None. + inject_index (int | None): The injection index for mixing noise. + Default: None. + return_latents (bool): Whether to return style latents. + Default: False. 
+ """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latent with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None + + +class ScaledLeakyReLU(nn.Module): + """Scaled LeakyReLU. + Args: + negative_slope (float): Negative slope. Default: 0.2. 
+ """ + + def __init__(self, negative_slope=0.2): + super(ScaledLeakyReLU, self).__init__() + self.negative_slope = negative_slope + + def forward(self, x): + out = F.leaky_relu(x, negative_slope=self.negative_slope) + return out * math.sqrt(2) + + +class EqualConv2d(nn.Module): + """Equalized Linear as StyleGAN2. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + stride (int): Stride of the convolution. Default: 1 + padding (int): Zero-padding added to both sides of the input. + Default: 0. + bias (bool): If ``True``, adds a learnable bias to the output. + Default: ``True``. + bias_init_val (float): Bias initialized value. Default: 0. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + bias=True, + bias_init_val=0, + ): + super(EqualConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.scale = 1 / math.sqrt(in_channels * kernel_size**2) + + self.weight = nn.Parameter( + torch.randn(out_channels, in_channels, kernel_size, kernel_size) + ) + if bias: + self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val)) + else: + self.register_parameter("bias", None) + + def forward(self, x): + out = F.conv2d( + x, + self.weight * self.scale, + bias=self.bias, + stride=self.stride, + padding=self.padding, + ) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, " + f"out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}," + f" stride={self.stride}, padding={self.padding}, " + f"bias={self.bias is not None})" + ) + + +class ConvLayer(nn.Sequential): + """Conv Layer used in StyleGAN2 Discriminator. + Args: + in_channels (int): Channel number of the input. 
+ out_channels (int): Channel number of the output. + kernel_size (int): Kernel size. + downsample (bool): Whether downsample by a factor of 2. + Default: False. + bias (bool): Whether with bias. Default: True. + activate (bool): Whether use activateion. Default: True. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + downsample=False, + bias=True, + activate=True, + interpolation_mode="bilinear", + ): + layers = [] + self.interpolation_mode = interpolation_mode + # downsample + if downsample: + if self.interpolation_mode == "nearest": + self.align_corners = None + else: + self.align_corners = False + + layers.append( + torch.nn.Upsample( + scale_factor=0.5, + mode=interpolation_mode, + align_corners=self.align_corners, + ) + ) + stride = 1 + self.padding = kernel_size // 2 + # conv + layers.append( + EqualConv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=self.padding, + bias=bias and not activate, + ) + ) + # activation + if activate: + if bias: + layers.append(FusedLeakyReLU(out_channels)) + else: + layers.append(ScaledLeakyReLU(0.2)) + + super(ConvLayer, self).__init__(*layers) + + +class ResBlock(nn.Module): + """Residual block used in StyleGAN2 Discriminator. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. 
+ """ + + def __init__(self, in_channels, out_channels, interpolation_mode="bilinear"): + super(ResBlock, self).__init__() + + self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True) + self.conv2 = ConvLayer( + in_channels, + out_channels, + 3, + downsample=True, + interpolation_mode=interpolation_mode, + bias=True, + activate=True, + ) + self.skip = ConvLayer( + in_channels, + out_channels, + 1, + downsample=True, + interpolation_mode=interpolation_mode, + bias=False, + activate=False, + ) + + def forward(self, x): + out = self.conv1(x) + out = self.conv2(out) + skip = self.skip(x) + out = (out + skip) / math.sqrt(2) + return out diff --git a/ldm_patched/pfn/architecture/face/stylegan2_clean_arch.py b/ldm_patched/pfn/architecture/face/stylegan2_clean_arch.py new file mode 100644 index 0000000000000000000000000000000000000000..c48de9af6904b8d1891a84efa8e4d76104d5d710 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/stylegan2_clean_arch.py @@ -0,0 +1,453 @@ +# pylint: skip-file +# type: ignore +import math + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn import init +from torch.nn.modules.batchnorm import _BatchNorm + + +@torch.no_grad() +def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs): + """Initialize network weights. + Args: + module_list (list[nn.Module] | nn.Module): Modules to be initialized. + scale (float): Scale initialized weights, especially for residual + blocks. Default: 1. + bias_fill (float): The value to fill bias. Default: 0 + kwargs (dict): Other arguments for initialization function. 
+ """ + if not isinstance(module_list, list): + module_list = [module_list] + for module in module_list: + for m in module.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight, **kwargs) + m.weight.data *= scale + if m.bias is not None: + m.bias.data.fill_(bias_fill) + elif isinstance(m, nn.Linear): + init.kaiming_normal_(m.weight, **kwargs) + m.weight.data *= scale + if m.bias is not None: + m.bias.data.fill_(bias_fill) + elif isinstance(m, _BatchNorm): + init.constant_(m.weight, 1) + if m.bias is not None: + m.bias.data.fill_(bias_fill) + + +class NormStyleCode(nn.Module): + def forward(self, x): + """Normalize the style codes. + Args: + x (Tensor): Style codes with shape (b, c). + Returns: + Tensor: Normalized tensor. + """ + return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8) + + +class ModulatedConv2d(nn.Module): + """Modulated Conv2d used in StyleGAN2. + There is no bias in ModulatedConv2d. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether to demodulate in the conv layer. Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None. + eps (float): A value added to the denominator for numerical stability. Default: 1e-8. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + eps=1e-8, + ): + super(ModulatedConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.demodulate = demodulate + self.sample_mode = sample_mode + self.eps = eps + + # modulation inside each modulated conv + self.modulation = nn.Linear(num_style_feat, in_channels, bias=True) + # initialization + default_init_weights( + self.modulation, + scale=1, + bias_fill=1, + a=0, + mode="fan_in", + nonlinearity="linear", + ) + + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, kernel_size, kernel_size) + / math.sqrt(in_channels * kernel_size**2) + ) + self.padding = kernel_size // 2 + + def forward(self, x, style): + """Forward function. + Args: + x (Tensor): Tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + Returns: + Tensor: Modulated tensor after convolution. 
+ """ + b, c, h, w = x.shape # c = c_in + # weight modulation + style = self.modulation(style).view(b, 1, c, 1, 1) + # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1) + weight = self.weight * style # (b, c_out, c_in, k, k) + + if self.demodulate: + demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps) + weight = weight * demod.view(b, self.out_channels, 1, 1, 1) + + weight = weight.view( + b * self.out_channels, c, self.kernel_size, self.kernel_size + ) + + # upsample or downsample if necessary + if self.sample_mode == "upsample": + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False) + elif self.sample_mode == "downsample": + x = F.interpolate(x, scale_factor=0.5, mode="bilinear", align_corners=False) + + b, c, h, w = x.shape + x = x.view(1, b * c, h, w) + # weight: (b*c_out, c_in, k, k), groups=b + out = F.conv2d(x, weight, padding=self.padding, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}, demodulate={self.demodulate}, sample_mode={self.sample_mode})" + ) + + +class StyleConv(nn.Module): + """Style conv used in StyleGAN2. + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether demodulate in the conv layer. Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + ): + super(StyleConv, self).__init__() + self.modulated_conv = ModulatedConv2d( + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=demodulate, + sample_mode=sample_mode, + ) + self.weight = nn.Parameter(torch.zeros(1)) # for noise injection + self.bias = nn.Parameter(torch.zeros(1, out_channels, 1, 1)) + self.activate = nn.LeakyReLU(negative_slope=0.2, inplace=True) + + def forward(self, x, style, noise=None): + # modulate + out = self.modulated_conv(x, style) * 2**0.5 # for conversion + # noise injection + if noise is None: + b, _, h, w = out.shape + noise = out.new_empty(b, 1, h, w).normal_() + out = out + self.weight * noise + # add bias + out = out + self.bias + # activation + out = self.activate(out) + return out + + +class ToRGB(nn.Module): + """To RGB (image space) from features. + Args: + in_channels (int): Channel number of input. + num_style_feat (int): Channel number of style features. + upsample (bool): Whether to upsample. Default: True. + """ + + def __init__(self, in_channels, num_style_feat, upsample=True): + super(ToRGB, self).__init__() + self.upsample = upsample + self.modulated_conv = ModulatedConv2d( + in_channels, + 3, + kernel_size=1, + num_style_feat=num_style_feat, + demodulate=False, + sample_mode=None, + ) + self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1)) + + def forward(self, x, style, skip=None): + """Forward function. + Args: + x (Tensor): Feature tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + skip (Tensor): Base/skip tensor. Default: None. + Returns: + Tensor: RGB images. 
+ """ + out = self.modulated_conv(x, style) + out = out + self.bias + if skip is not None: + if self.upsample: + skip = F.interpolate( + skip, scale_factor=2, mode="bilinear", align_corners=False + ) + out = out + skip + return out + + +class ConstantInput(nn.Module): + """Constant input. + Args: + num_channel (int): Channel number of constant input. + size (int): Spatial size of constant input. + """ + + def __init__(self, num_channel, size): + super(ConstantInput, self).__init__() + self.weight = nn.Parameter(torch.randn(1, num_channel, size, size)) + + def forward(self, batch): + out = self.weight.repeat(batch, 1, 1, 1) + return out + + +class StyleGAN2GeneratorClean(nn.Module): + """Clean version of StyleGAN2 Generator. + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + narrow (float): Narrow ratio for channels. Default: 1.0. 
+ """ + + def __init__( + self, out_size, num_style_feat=512, num_mlp=8, channel_multiplier=2, narrow=1 + ): + super(StyleGAN2GeneratorClean, self).__init__() + # Style MLP layers + self.num_style_feat = num_style_feat + style_mlp_layers = [NormStyleCode()] + for i in range(num_mlp): + style_mlp_layers.extend( + [ + nn.Linear(num_style_feat, num_style_feat, bias=True), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + ] + ) + self.style_mlp = nn.Sequential(*style_mlp_layers) + # initialization + default_init_weights( + self.style_mlp, + scale=1, + bias_fill=0, + a=0.2, + mode="fan_in", + nonlinearity="leaky_relu", + ) + + # channel list + channels = { + "4": int(512 * narrow), + "8": int(512 * narrow), + "16": int(512 * narrow), + "32": int(512 * narrow), + "64": int(256 * channel_multiplier * narrow), + "128": int(128 * channel_multiplier * narrow), + "256": int(64 * channel_multiplier * narrow), + "512": int(32 * channel_multiplier * narrow), + "1024": int(16 * channel_multiplier * narrow), + } + self.channels = channels + + self.constant_input = ConstantInput(channels["4"], size=4) + self.style_conv1 = StyleConv( + channels["4"], + channels["4"], + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + ) + self.to_rgb1 = ToRGB(channels["4"], num_style_feat, upsample=False) + + self.log_size = int(math.log(out_size, 2)) + self.num_layers = (self.log_size - 2) * 2 + 1 + self.num_latent = self.log_size * 2 - 2 + + self.style_convs = nn.ModuleList() + self.to_rgbs = nn.ModuleList() + self.noises = nn.Module() + + in_channels = channels["4"] + # noise + for layer_idx in range(self.num_layers): + resolution = 2 ** ((layer_idx + 5) // 2) + shape = [1, 1, resolution, resolution] + self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape)) + # style convs and to_rgbs + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2**i}"] + self.style_convs.append( + StyleConv( + in_channels, + out_channels, + 
kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode="upsample", + ) + ) + self.style_convs.append( + StyleConv( + out_channels, + out_channels, + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + ) + ) + self.to_rgbs.append(ToRGB(out_channels, num_style_feat, upsample=True)) + in_channels = out_channels + + def make_noise(self): + """Make noise for noise injection.""" + device = self.constant_input.weight.device + noises = [torch.randn(1, 1, 4, 4, device=device)] + + for i in range(3, self.log_size + 1): + for _ in range(2): + noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) + + return noises + + def get_latent(self, x): + return self.style_mlp(x) + + def mean_latent(self, num_latent): + latent_in = torch.randn( + num_latent, self.num_style_feat, device=self.constant_input.weight.device + ) + latent = self.style_mlp(latent_in).mean(0, keepdim=True) + return latent + + def forward( + self, + styles, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2GeneratorClean. + Args: + styles (list[Tensor]): Sample codes of styles. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. + truncation_latent (Tensor | None): The truncation latent tensor. Default: None. + inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. 
+ """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) # feature back to the rgb space + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None diff --git a/ldm_patched/pfn/architecture/face/upfirdn2d.py b/ldm_patched/pfn/architecture/face/upfirdn2d.py new file mode 100644 index 
0000000000000000000000000000000000000000..4ea4541513f27e3c9dddcee864cfeb87efddadb7 --- /dev/null +++ b/ldm_patched/pfn/architecture/face/upfirdn2d.py @@ -0,0 +1,194 @@ +# pylint: skip-file +# type: ignore +# modify from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py # noqa:E501 + +import os + +import torch +from torch.autograd import Function +from torch.nn import functional as F + +upfirdn2d_ext = None + + +class UpFirDn2dBackward(Function): + @staticmethod + def forward( + ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size + ): + up_x, up_y = up + down_x, down_y = down + g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad + + grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) + + grad_input = upfirdn2d_ext.upfirdn2d( + grad_output, + grad_kernel, + down_x, + down_y, + up_x, + up_y, + g_pad_x0, + g_pad_x1, + g_pad_y0, + g_pad_y1, + ) + grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3]) + + ctx.save_for_backward(kernel) + + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + ctx.up_x = up_x + ctx.up_y = up_y + ctx.down_x = down_x + ctx.down_y = down_y + ctx.pad_x0 = pad_x0 + ctx.pad_x1 = pad_x1 + ctx.pad_y0 = pad_y0 + ctx.pad_y1 = pad_y1 + ctx.in_size = in_size + ctx.out_size = out_size + + return grad_input + + @staticmethod + def backward(ctx, gradgrad_input): + (kernel,) = ctx.saved_tensors + + gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1) + + gradgrad_out = upfirdn2d_ext.upfirdn2d( + gradgrad_input, + kernel, + ctx.up_x, + ctx.up_y, + ctx.down_x, + ctx.down_y, + ctx.pad_x0, + ctx.pad_x1, + ctx.pad_y0, + ctx.pad_y1, + ) + # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], + # ctx.out_size[1], ctx.in_size[3]) + gradgrad_out = gradgrad_out.view( + ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1] + ) + + return gradgrad_out, None, None, None, None, None, None, None, None + + +class UpFirDn2d(Function): + @staticmethod 
+ def forward(ctx, input, kernel, up, down, pad): + up_x, up_y = up + down_x, down_y = down + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + kernel_h, kernel_w = kernel.shape + _, channel, in_h, in_w = input.shape + ctx.in_size = input.shape + + input = input.reshape(-1, in_h, in_w, 1) + + ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1])) + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + ctx.out_size = (out_h, out_w) + + ctx.up = (up_x, up_y) + ctx.down = (down_x, down_y) + ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1) + + g_pad_x0 = kernel_w - pad_x0 - 1 + g_pad_y0 = kernel_h - pad_y0 - 1 + g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1 + g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1 + + ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1) + + out = upfirdn2d_ext.upfirdn2d( + input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1 + ) + # out = out.view(major, out_h, out_w, minor) + out = out.view(-1, channel, out_h, out_w) + + return out + + @staticmethod + def backward(ctx, grad_output): + kernel, grad_kernel = ctx.saved_tensors + + grad_input = UpFirDn2dBackward.apply( + grad_output, + kernel, + grad_kernel, + ctx.up, + ctx.down, + ctx.pad, + ctx.g_pad, + ctx.in_size, + ctx.out_size, + ) + + return grad_input, None, None, None, None + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + if input.device.type == "cpu": + out = upfirdn2d_native( + input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1] + ) + else: + out = UpFirDn2d.apply( + input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1]) + ) + + return out + + +def upfirdn2d_native( + input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1 +): + _, channel, in_h, in_w = input.shape + input = input.reshape(-1, in_h, in_w, 1) + + _, in_h, in_w, minor = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, 
in_h, 1, in_w, 1, minor) + out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) + out = out.view(-1, in_h * up_y, in_w * up_x, minor) + + out = F.pad( + out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)] + ) + out = out[ + :, + max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0), + :, + ] + + out = out.permute(0, 3, 1, 2) + out = out.reshape( + [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1] + ) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape( + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ) + out = out.permute(0, 2, 3, 1) + out = out[:, ::down_y, ::down_x, :] + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + + return out.view(-1, channel, out_h, out_w) diff --git a/ldm_patched/pfn/architecture/timm/LICENSE b/ldm_patched/pfn/architecture/timm/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..b4e9438bd1e07e17abf58cfd86e536ec880348a3 --- /dev/null +++ b/ldm_patched/pfn/architecture/timm/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2019 Ross Wightman + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/ldm_patched/pfn/architecture/timm/__pycache__/drop.cpython-310.pyc b/ldm_patched/pfn/architecture/timm/__pycache__/drop.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b991ad636f224bf0d16ef255d2ce0c02f310297 Binary files /dev/null and b/ldm_patched/pfn/architecture/timm/__pycache__/drop.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/timm/__pycache__/helpers.cpython-310.pyc b/ldm_patched/pfn/architecture/timm/__pycache__/helpers.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e9da7efaf6b51d0d5c069e1febc7f9f75b937c8 Binary files /dev/null and b/ldm_patched/pfn/architecture/timm/__pycache__/helpers.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/timm/__pycache__/weight_init.cpython-310.pyc b/ldm_patched/pfn/architecture/timm/__pycache__/weight_init.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d3afc71c0ed21f103855e6c2112a9031ceb11669 Binary files /dev/null and b/ldm_patched/pfn/architecture/timm/__pycache__/weight_init.cpython-310.pyc differ diff --git a/ldm_patched/pfn/architecture/timm/drop.py b/ldm_patched/pfn/architecture/timm/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..14f0da914b2a198af7e6124cd90bad6adaf8a84e --- /dev/null +++ b/ldm_patched/pfn/architecture/timm/drop.py @@ -0,0 
+1,223 @@ +""" DropBlock, DropPath + +PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. + +Papers: +DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) + +Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) + +Code: +DropBlock impl inspired by two Tensorflow impl that I liked: + - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 + - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def drop_block_2d( + x, + drop_prob: float = 0.1, + block_size: int = 7, + gamma_scale: float = 1.0, + with_noise: bool = False, + inplace: bool = False, + batchwise: bool = False, +): + """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. This layer has been tested on a few training + runs with success, but needs further validation and possibly optimization for lower runtime impact. + """ + _, C, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + # seed_drop_rate, the gamma parameter + gamma = ( + gamma_scale + * drop_prob + * total_size + / clipped_block_size**2 + / ((W - block_size + 1) * (H - block_size + 1)) + ) + + # Forces the block to be inside the feature map. 
+ w_i, h_i = torch.meshgrid( + torch.arange(W).to(x.device), torch.arange(H).to(x.device) + ) + valid_block = ( + (w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2) + ) & ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) + valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) + + if batchwise: + # one mask for whole batch, quite a bit faster + uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) + else: + uniform_noise = torch.rand_like(x) + block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) + block_mask = -F.max_pool2d( + -block_mask, + kernel_size=clipped_block_size, # block_size, + stride=1, + padding=clipped_block_size // 2, + ) + + if with_noise: + normal_noise = ( + torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) + if batchwise + else torch.randn_like(x) + ) + if inplace: + x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) + else: + x = x * block_mask + normal_noise * (1 - block_mask) + else: + normalize_scale = ( + block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7) + ).to(x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +def drop_block_fast_2d( + x: torch.Tensor, + drop_prob: float = 0.1, + block_size: int = 7, + gamma_scale: float = 1.0, + with_noise: bool = False, + inplace: bool = False, +): + """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. Simplied from above without concern for valid + block mask at edges. 
+ """ + _, _, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + gamma = ( + gamma_scale + * drop_prob + * total_size + / clipped_block_size**2 + / ((W - block_size + 1) * (H - block_size + 1)) + ) + + block_mask = torch.empty_like(x).bernoulli_(gamma) + block_mask = F.max_pool2d( + block_mask.to(x.dtype), + kernel_size=clipped_block_size, + stride=1, + padding=clipped_block_size // 2, + ) + + if with_noise: + normal_noise = torch.empty_like(x).normal_() + if inplace: + x.mul_(1.0 - block_mask).add_(normal_noise * block_mask) + else: + x = x * (1.0 - block_mask) + normal_noise * block_mask + else: + block_mask = 1 - block_mask + normalize_scale = ( + block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6) + ).to(dtype=x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +class DropBlock2d(nn.Module): + """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf""" + + def __init__( + self, + drop_prob: float = 0.1, + block_size: int = 7, + gamma_scale: float = 1.0, + with_noise: bool = False, + inplace: bool = False, + batchwise: bool = False, + fast: bool = True, + ): + super(DropBlock2d, self).__init__() + self.drop_prob = drop_prob + self.gamma_scale = gamma_scale + self.block_size = block_size + self.with_noise = with_noise + self.inplace = inplace + self.batchwise = batchwise + self.fast = fast # FIXME finish comparisons of fast vs not + + def forward(self, x): + if not self.training or not self.drop_prob: + return x + if self.fast: + return drop_block_fast_2d( + x, + self.drop_prob, + self.block_size, + self.gamma_scale, + self.with_noise, + self.inplace, + ) + else: + return drop_block_2d( + x, + self.drop_prob, + self.block_size, + self.gamma_scale, + self.with_noise, + self.inplace, + self.batchwise, + ) + + +def drop_path( + x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True +): + """Drop paths (Stochastic 
Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" diff --git a/ldm_patched/pfn/architecture/timm/helpers.py b/ldm_patched/pfn/architecture/timm/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..cdafee0709165dd992118e3b09b8d26f70ea8a2a --- /dev/null +++ b/ldm_patched/pfn/architecture/timm/helpers.py @@ -0,0 +1,31 @@ +""" Layer/Module Helpers +Hacked together by / Copyright 2020 Ross Wightman +""" +import collections.abc +from itertools import repeat + + +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) 
+to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def make_divisible(v, divisor=8, min_value=None, round_limit=0.9): + min_value = min_value or divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < round_limit * v: + new_v += divisor + return new_v diff --git a/ldm_patched/pfn/architecture/timm/weight_init.py b/ldm_patched/pfn/architecture/timm/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..b0169774657d86c1946008e746f2f4f7e833a44c --- /dev/null +++ b/ldm_patched/pfn/architecture/timm/weight_init.py @@ -0,0 +1,128 @@ +import math +import warnings + +import torch +from torch.nn.init import _calculate_fan_in_and_fan_out + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. 
+ tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_( + tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0 +) -> torch.Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are + applied while sampling the normal with mean/std applied, therefore a, b args + should be adjusted to match the range of mean, std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +def trunc_normal_tf_( + tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0 +) -> torch.Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + _no_grad_trunc_normal_(tensor, 0, 1.0, a, b) + with torch.no_grad(): + tensor.mul_(std).add_(mean) + return tensor + + +def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == "fan_in": + denom = fan_in + elif mode == "fan_out": + denom = fan_out + elif mode == "fan_avg": + denom = (fan_in + fan_out) / 2 + + variance = scale / denom # type: ignore + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978) + elif distribution == "normal": + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + # pylint: disable=invalid-unary-operand-type + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal") diff --git a/ldm_patched/pfn/model_loading.py b/ldm_patched/pfn/model_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..e000871c1bfe66a07dc13b51ad709cb0de092a41 --- /dev/null +++ b/ldm_patched/pfn/model_loading.py @@ -0,0 +1,99 @@ +import logging as logger + +from .architecture.DAT import DAT +from .architecture.face.codeformer import CodeFormer +from .architecture.face.gfpganv1_clean_arch import 
GFPGANv1Clean +from .architecture.face.restoreformer_arch import RestoreFormer +from .architecture.HAT import HAT +from .architecture.LaMa import LaMa +from .architecture.OmniSR.OmniSR import OmniSR +from .architecture.RRDB import RRDBNet as ESRGAN +from .architecture.SCUNet import SCUNet +from .architecture.SPSR import SPSRNet as SPSR +from .architecture.SRVGG import SRVGGNetCompact as RealESRGANv2 +from .architecture.SwiftSRGAN import Generator as SwiftSRGAN +from .architecture.Swin2SR import Swin2SR +from .architecture.SwinIR import SwinIR +from .types import PyTorchModel + + +class UnsupportedModel(Exception): + pass + + +def load_state_dict(state_dict) -> PyTorchModel: + logger.debug(f"Loading state dict into pytorch model arch") + + state_dict_keys = list(state_dict.keys()) + + if "params_ema" in state_dict_keys: + state_dict = state_dict["params_ema"] + elif "params-ema" in state_dict_keys: + state_dict = state_dict["params-ema"] + elif "params" in state_dict_keys: + state_dict = state_dict["params"] + + state_dict_keys = list(state_dict.keys()) + # SRVGGNet Real-ESRGAN (v2) + if "body.0.weight" in state_dict_keys and "body.1.weight" in state_dict_keys: + model = RealESRGANv2(state_dict) + # SPSR (ESRGAN with lots of extra layers) + elif "f_HR_conv1.0.weight" in state_dict: + model = SPSR(state_dict) + # Swift-SRGAN + elif ( + "model" in state_dict_keys + and "initial.cnn.depthwise.weight" in state_dict["model"].keys() + ): + model = SwiftSRGAN(state_dict) + # SwinIR, Swin2SR, HAT + elif "layers.0.residual_group.blocks.0.norm1.weight" in state_dict_keys: + if ( + "layers.0.residual_group.blocks.0.conv_block.cab.0.weight" + in state_dict_keys + ): + model = HAT(state_dict) + elif "patch_embed.proj.weight" in state_dict_keys: + model = Swin2SR(state_dict) + else: + model = SwinIR(state_dict) + # GFPGAN + elif ( + "toRGB.0.weight" in state_dict_keys + and "stylegan_decoder.style_mlp.1.weight" in state_dict_keys + ): + model = GFPGANv1Clean(state_dict) + # 
RestoreFormer + elif ( + "encoder.conv_in.weight" in state_dict_keys + and "encoder.down.0.block.0.norm1.weight" in state_dict_keys + ): + model = RestoreFormer(state_dict) + elif ( + "encoder.blocks.0.weight" in state_dict_keys + and "quantize.embedding.weight" in state_dict_keys + ): + model = CodeFormer(state_dict) + # LaMa + elif ( + "model.model.1.bn_l.running_mean" in state_dict_keys + or "generator.model.1.bn_l.running_mean" in state_dict_keys + ): + model = LaMa(state_dict) + # Omni-SR + elif "residual_layer.0.residual_layer.0.layer.0.fn.0.weight" in state_dict_keys: + model = OmniSR(state_dict) + # SCUNet + elif "m_head.0.weight" in state_dict_keys and "m_tail.0.weight" in state_dict_keys: + model = SCUNet(state_dict) + # DAT + elif "layers.0.blocks.2.attn.attn_mask_0" in state_dict_keys: + model = DAT(state_dict) + # Regular ESRGAN, "new-arch" ESRGAN, Real-ESRGAN v1 + else: + try: + model = ESRGAN(state_dict) + except: + # pylint: disable=raise-missing-from + raise UnsupportedModel + return model diff --git a/ldm_patched/pfn/types.py b/ldm_patched/pfn/types.py new file mode 100644 index 0000000000000000000000000000000000000000..193333b9e8049d9558ca2ea253d41ee44b0b294b --- /dev/null +++ b/ldm_patched/pfn/types.py @@ -0,0 +1,69 @@ +from typing import Union + +from .architecture.DAT import DAT +from .architecture.face.codeformer import CodeFormer +from .architecture.face.gfpganv1_clean_arch import GFPGANv1Clean +from .architecture.face.restoreformer_arch import RestoreFormer +from .architecture.HAT import HAT +from .architecture.LaMa import LaMa +from .architecture.OmniSR.OmniSR import OmniSR +from .architecture.RRDB import RRDBNet as ESRGAN +from .architecture.SCUNet import SCUNet +from .architecture.SPSR import SPSRNet as SPSR +from .architecture.SRVGG import SRVGGNetCompact as RealESRGANv2 +from .architecture.SwiftSRGAN import Generator as SwiftSRGAN +from .architecture.Swin2SR import Swin2SR +from .architecture.SwinIR import SwinIR + +PyTorchSRModels = ( 
+ RealESRGANv2, + SPSR, + SwiftSRGAN, + ESRGAN, + SwinIR, + Swin2SR, + HAT, + OmniSR, + SCUNet, + DAT, +) +PyTorchSRModel = Union[ + RealESRGANv2, + SPSR, + SwiftSRGAN, + ESRGAN, + SwinIR, + Swin2SR, + HAT, + OmniSR, + SCUNet, + DAT, +] + + +def is_pytorch_sr_model(model: object): + return isinstance(model, PyTorchSRModels) + + +PyTorchFaceModels = (GFPGANv1Clean, RestoreFormer, CodeFormer) +PyTorchFaceModel = Union[GFPGANv1Clean, RestoreFormer, CodeFormer] + + +def is_pytorch_face_model(model: object): + return isinstance(model, PyTorchFaceModels) + + +PyTorchInpaintModels = (LaMa,) +PyTorchInpaintModel = Union[LaMa] + + +def is_pytorch_inpaint_model(model: object): + return isinstance(model, PyTorchInpaintModels) + + +PyTorchModels = (*PyTorchSRModels, *PyTorchFaceModels, *PyTorchInpaintModels) +PyTorchModel = Union[PyTorchSRModel, PyTorchFaceModel, PyTorchInpaintModel] + + +def is_pytorch_model(model: object): + return isinstance(model, PyTorchModels) diff --git a/ldm_patched/t2ia/__pycache__/adapter.cpython-310.pyc b/ldm_patched/t2ia/__pycache__/adapter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0097c94d2de4a217893926c104b19c89211515c Binary files /dev/null and b/ldm_patched/t2ia/__pycache__/adapter.cpython-310.pyc differ diff --git a/ldm_patched/t2ia/adapter.py b/ldm_patched/t2ia/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..e9a606b1cd67fd9a955a0ea0a86d1bd5498d85e5 --- /dev/null +++ b/ldm_patched/t2ia/adapter.py @@ -0,0 +1,293 @@ +#taken from https://github.com/TencentARC/T2I-Adapter +import torch +import torch.nn as nn +from collections import OrderedDict + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. 
+ """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=padding + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + if not self.use_conv: + padding = [x.shape[2] % 2, x.shape[3] % 2] + self.op.padding = padding + + x = self.op(x) + return x + + +class ResnetBlock(nn.Module): + def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True): + super().__init__() + ps = ksize // 2 + if in_c != out_c or sk == False: + self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps) + else: + # print('n_in') + self.in_conv = None + self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1) + self.act = nn.ReLU() + self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps) + if sk 
== False: + self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps) + else: + self.skep = None + + self.down = down + if self.down == True: + self.down_opt = Downsample(in_c, use_conv=use_conv) + + def forward(self, x): + if self.down == True: + x = self.down_opt(x) + if self.in_conv is not None: # edit + x = self.in_conv(x) + + h = self.block1(x) + h = self.act(h) + h = self.block2(h) + if self.skep is not None: + return h + self.skep(x) + else: + return h + x + + +class Adapter(nn.Module): + def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True, xl=True): + super(Adapter, self).__init__() + self.unshuffle_amount = 8 + resblock_no_downsample = [] + resblock_downsample = [3, 2, 1] + self.xl = xl + if self.xl: + self.unshuffle_amount = 16 + resblock_no_downsample = [1] + resblock_downsample = [2] + + self.input_channels = cin // (self.unshuffle_amount * self.unshuffle_amount) + self.unshuffle = nn.PixelUnshuffle(self.unshuffle_amount) + self.channels = channels + self.nums_rb = nums_rb + self.body = [] + for i in range(len(channels)): + for j in range(nums_rb): + if (i in resblock_downsample) and (j == 0): + self.body.append( + ResnetBlock(channels[i - 1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv)) + elif (i in resblock_no_downsample) and (j == 0): + self.body.append( + ResnetBlock(channels[i - 1], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv)) + else: + self.body.append( + ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv)) + self.body = nn.ModuleList(self.body) + self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1) + + def forward(self, x): + # unshuffle + x = self.unshuffle(x) + # extract features + features = [] + x = self.conv_in(x) + for i in range(len(self.channels)): + for j in range(self.nums_rb): + idx = i * self.nums_rb + j + x = self.body[idx](x) + if self.xl: + features.append(None) + if i == 0: + features.append(None) + 
features.append(None) + if i == 2: + features.append(None) + else: + features.append(None) + features.append(None) + features.append(x) + + return features + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([("c_fc", nn.Linear(d_model, d_model * 4)), ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class StyleAdapter(nn.Module): + + def __init__(self, width=1024, context_dim=768, num_head=8, n_layes=3, num_token=4): + super().__init__() + + scale = width ** -0.5 + self.transformer_layes = nn.Sequential(*[ResidualAttentionBlock(width, num_head) for _ in range(n_layes)]) + self.num_token = num_token + self.style_embedding = nn.Parameter(torch.randn(1, num_token, width) * scale) + self.ln_post = LayerNorm(width) + self.ln_pre = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, context_dim)) + + def forward(self, x): + # x shape [N, HW+1, C] + style_embedding = self.style_embedding + torch.zeros( + (x.shape[0], self.num_token, self.style_embedding.shape[-1]), 
device=x.device) + x = torch.cat([x, style_embedding], dim=1) + x = self.ln_pre(x) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer_layes(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, -self.num_token:, :]) + x = x @ self.proj + + return x + + +class ResnetBlock_light(nn.Module): + def __init__(self, in_c): + super().__init__() + self.block1 = nn.Conv2d(in_c, in_c, 3, 1, 1) + self.act = nn.ReLU() + self.block2 = nn.Conv2d(in_c, in_c, 3, 1, 1) + + def forward(self, x): + h = self.block1(x) + h = self.act(h) + h = self.block2(h) + + return h + x + + +class extractor(nn.Module): + def __init__(self, in_c, inter_c, out_c, nums_rb, down=False): + super().__init__() + self.in_conv = nn.Conv2d(in_c, inter_c, 1, 1, 0) + self.body = [] + for _ in range(nums_rb): + self.body.append(ResnetBlock_light(inter_c)) + self.body = nn.Sequential(*self.body) + self.out_conv = nn.Conv2d(inter_c, out_c, 1, 1, 0) + self.down = down + if self.down == True: + self.down_opt = Downsample(in_c, use_conv=False) + + def forward(self, x): + if self.down == True: + x = self.down_opt(x) + x = self.in_conv(x) + x = self.body(x) + x = self.out_conv(x) + + return x + + +class Adapter_light(nn.Module): + def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64): + super(Adapter_light, self).__init__() + self.unshuffle_amount = 8 + self.unshuffle = nn.PixelUnshuffle(self.unshuffle_amount) + self.input_channels = cin // (self.unshuffle_amount * self.unshuffle_amount) + self.channels = channels + self.nums_rb = nums_rb + self.body = [] + self.xl = False + + for i in range(len(channels)): + if i == 0: + self.body.append(extractor(in_c=cin, inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=False)) + else: + self.body.append(extractor(in_c=channels[i-1], inter_c=channels[i]//4, out_c=channels[i], nums_rb=nums_rb, down=True)) + self.body = nn.ModuleList(self.body) + + def forward(self, x): + # unshuffle + x = self.unshuffle(x) + # extract features + 
features = [] + for i in range(len(self.channels)): + x = self.body[i](x) + features.append(None) + features.append(None) + features.append(x) + + return features diff --git a/ldm_patched/taesd/__pycache__/taesd.cpython-310.pyc b/ldm_patched/taesd/__pycache__/taesd.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b9971002e26bb07d1a8fafee039af995813caedd Binary files /dev/null and b/ldm_patched/taesd/__pycache__/taesd.cpython-310.pyc differ diff --git a/ldm_patched/taesd/taesd.py b/ldm_patched/taesd/taesd.py new file mode 100644 index 0000000000000000000000000000000000000000..0b4b885f74bd9d7f1cd2560b9f07daa579e634a3 --- /dev/null +++ b/ldm_patched/taesd/taesd.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +""" +Tiny AutoEncoder for Stable Diffusion +(DNN for encoding / decoding SD's latent space) +""" +import torch +import torch.nn as nn + +import ldm_patched.modules.utils +import ldm_patched.modules.ops + +def conv(n_in, n_out, **kwargs): + return ldm_patched.modules.ops.disable_weight_init.Conv2d(n_in, n_out, 3, padding=1, **kwargs) + +class Clamp(nn.Module): + def forward(self, x): + return torch.tanh(x / 3) * 3 + +class Block(nn.Module): + def __init__(self, n_in, n_out): + super().__init__() + self.conv = nn.Sequential(conv(n_in, n_out), nn.ReLU(), conv(n_out, n_out), nn.ReLU(), conv(n_out, n_out)) + self.skip = ldm_patched.modules.ops.disable_weight_init.Conv2d(n_in, n_out, 1, bias=False) if n_in != n_out else nn.Identity() + self.fuse = nn.ReLU() + def forward(self, x): + return self.fuse(self.conv(x) + self.skip(x)) + +def Encoder(): + return nn.Sequential( + conv(3, 64), Block(64, 64), + conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64), + conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64), + conv(64, 64, stride=2, bias=False), Block(64, 64), Block(64, 64), Block(64, 64), + conv(64, 4), + ) + +def Decoder(): + return nn.Sequential( + Clamp(), conv(4, 64), nn.ReLU(), 
+ Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False), + Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False), + Block(64, 64), Block(64, 64), Block(64, 64), nn.Upsample(scale_factor=2), conv(64, 64, bias=False), + Block(64, 64), conv(64, 3), + ) + +class TAESD(nn.Module): + latent_magnitude = 3 + latent_shift = 0.5 + + def __init__(self, encoder_path=None, decoder_path=None): + """Initialize pretrained TAESD on the given device from the given checkpoints.""" + super().__init__() + self.taesd_encoder = Encoder() + self.taesd_decoder = Decoder() + self.vae_scale = torch.nn.Parameter(torch.tensor(1.0)) + if encoder_path is not None: + self.taesd_encoder.load_state_dict(ldm_patched.modules.utils.load_torch_file(encoder_path, safe_load=True)) + if decoder_path is not None: + self.taesd_decoder.load_state_dict(ldm_patched.modules.utils.load_torch_file(decoder_path, safe_load=True)) + + @staticmethod + def scale_latents(x): + """raw latents -> [0, 1]""" + return x.div(2 * TAESD.latent_magnitude).add(TAESD.latent_shift).clamp(0, 1) + + @staticmethod + def unscale_latents(x): + """[0, 1] -> raw latents""" + return x.sub(TAESD.latent_shift).mul(2 * TAESD.latent_magnitude) + + def decode(self, x): + x_sample = self.taesd_decoder(x * self.vae_scale) + x_sample = x_sample.sub(0.5).mul(2) + return x_sample + + def encode(self, x): + return self.taesd_encoder(x * 0.5 + 0.5) / self.vae_scale diff --git a/ldm_patched/unipc/__pycache__/uni_pc.cpython-310.pyc b/ldm_patched/unipc/__pycache__/uni_pc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e25d65c1851c47da6bec4e4b9853995550af6b1 Binary files /dev/null and b/ldm_patched/unipc/__pycache__/uni_pc.cpython-310.pyc differ diff --git a/ldm_patched/unipc/uni_pc.py b/ldm_patched/unipc/uni_pc.py new file mode 100644 index 0000000000000000000000000000000000000000..08bf0fc9e6787aec84500b4e3d24a4c8d253b433 --- 
/dev/null +++ b/ldm_patched/unipc/uni_pc.py @@ -0,0 +1,894 @@ +#code taken from: https://github.com/wl-zhao/UniPC and modified + +import torch +import torch.nn.functional as F +import math + +from tqdm.auto import trange, tqdm + + +class NoiseScheduleVP: + def __init__( + self, + schedule='discrete', + betas=None, + alphas_cumprod=None, + continuous_beta_0=0.1, + continuous_beta_1=20., + ): + """Create a wrapper class for the forward SDE (VP type). + + *** + Update: We support discrete-time diffusion models by implementing a picewise linear interpolation for log_alpha_t. + We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images. + *** + + The forward SDE ensures that the condition distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ). + We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper). + Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have: + + log_alpha_t = self.marginal_log_mean_coeff(t) + sigma_t = self.marginal_std(t) + lambda_t = self.marginal_lambda(t) + + Moreover, as lambda(t) is an invertible function, we also support its inverse function: + + t = self.inverse_lambda(lambda_t) + + =============================================================== + + We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]). + + 1. For discrete-time DPMs: + + For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by: + t_i = (i + 1) / N + e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1. + We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3. + + Args: + betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details) + alphas_cumprod: A `torch.Tensor`. 
The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details) + + Note that we always have alphas_cumprod = cumprod(betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`. + + **Important**: Please pay special attention for the args for `alphas_cumprod`: + The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that + q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ). + Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have + alpha_{t_n} = \sqrt{\hat{alpha_n}}, + and + log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}). + + + 2. For continuous-time DPMs: + + We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise + schedule are the default settings in DDPM and improved-DDPM: + + Args: + beta_min: A `float` number. The smallest beta for the linear schedule. + beta_max: A `float` number. The largest beta for the linear schedule. + cosine_s: A `float` number. The hyperparameter in the cosine schedule. + cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule. + T: A `float` number. The ending time of the forward process. + + =============================================================== + + Args: + schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs, + 'linear' or 'cosine' for continuous-time DPMs. + Returns: + A wrapper object of the forward SDE (VP type). 
+ + =============================================================== + + Example: + + # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): + >>> ns = NoiseScheduleVP('discrete', betas=betas) + + # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): + >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) + + # For continuous-time DPMs (VPSDE), linear schedule: + >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) + + """ + + if schedule not in ['discrete', 'linear', 'cosine']: + raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule)) + + self.schedule = schedule + if schedule == 'discrete': + if betas is not None: + log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) + else: + assert alphas_cumprod is not None + log_alphas = 0.5 * torch.log(alphas_cumprod) + self.total_N = len(log_alphas) + self.T = 1. + self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)) + self.log_alpha_array = log_alphas.reshape((1, -1,)) + else: + self.total_N = 1000 + self.beta_0 = continuous_beta_0 + self.beta_1 = continuous_beta_1 + self.cosine_s = 0.008 + self.cosine_beta_max = 999. + self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s + self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.)) + self.schedule = schedule + if schedule == 'cosine': + # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T. + # Note that T = 0.9946 may be not the optimal setting. However, we find it works well. + self.T = 0.9946 + else: + self.T = 1. + + def marginal_log_mean_coeff(self, t): + """ + Compute log(alpha_t) of a given continuous-time label t in [0, T]. 
+ """ + if self.schedule == 'discrete': + return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1)) + elif self.schedule == 'linear': + return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 + elif self.schedule == 'cosine': + log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.)) + log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0 + return log_alpha_t + + def marginal_alpha(self, t): + """ + Compute alpha_t of a given continuous-time label t in [0, T]. + """ + return torch.exp(self.marginal_log_mean_coeff(t)) + + def marginal_std(self, t): + """ + Compute sigma_t of a given continuous-time label t in [0, T]. + """ + return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t))) + + def marginal_lambda(self, t): + """ + Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T]. + """ + log_mean_coeff = self.marginal_log_mean_coeff(t) + log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff)) + return log_mean_coeff - log_std + + def inverse_lambda(self, lamb): + """ + Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t. + """ + if self.schedule == 'linear': + tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) + Delta = self.beta_0**2 + tmp + return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0) + elif self.schedule == 'discrete': + log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb) + t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1])) + return t.reshape((-1,)) + else: + log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) + t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. 
+ self.cosine_s) / math.pi - self.cosine_s + t = t_fn(log_alpha) + return t + + +def model_wrapper( + model, + noise_schedule, + model_type="noise", + model_kwargs={}, + guidance_type="uncond", + condition=None, + unconditional_condition=None, + guidance_scale=1., + classifier_fn=None, + classifier_kwargs={}, +): + """Create a wrapper function for the noise prediction model. + + DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to + firstly wrap the model function to a noise prediction model that accepts the continuous time as the input. + + We support four types of the diffusion model by setting `model_type`: + + 1. "noise": noise prediction model. (Trained by predicting noise). + + 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0). + + 3. "v": velocity prediction model. (Trained by predicting the velocity). + The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2]. + + [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models." + arXiv preprint arXiv:2202.00512 (2022). + [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models." + arXiv preprint arXiv:2210.02303 (2022). + + 4. "score": marginal score function. (Trained by denoising score matching). + Note that the score function and the noise prediction model follows a simple relationship: + ``` + noise(x_t, t) = -sigma_t * score(x_t, t) + ``` + + We support three types of guided sampling by DPMs by setting `guidance_type`: + 1. "uncond": unconditional sampling by DPMs. + The input `model` has the following format: + `` + model(x, t_input, **model_kwargs) -> noise | x_start | v | score + `` + + 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier. 
+ The input `model` has the following format: + `` + model(x, t_input, **model_kwargs) -> noise | x_start | v | score + `` + + The input `classifier_fn` has the following format: + `` + classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond) + `` + + [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis," + in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794. + + 3. "classifier-free": classifier-free guidance sampling by conditional DPMs. + The input `model` has the following format: + `` + model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score + `` + And if cond == `unconditional_condition`, the model output is the unconditional DPM output. + + [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance." + arXiv preprint arXiv:2207.12598 (2022). + + + The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999) + or continuous-time labels (i.e. epsilon to T). + + We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise: + `` + def model_fn(x, t_continuous) -> noise: + t_input = get_model_input_time(t_continuous) + return noise_pred(model, x, t_input, **model_kwargs) + `` + where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver. + + =============================================================== + + Args: + model: A diffusion model with the corresponding format described above. + noise_schedule: A noise schedule object, such as NoiseScheduleVP. + model_type: A `str`. The parameterization type of the diffusion model. + "noise" or "x_start" or "v" or "score". + model_kwargs: A `dict`. A dict for the other inputs of the model function. + guidance_type: A `str`. The type of the guidance for sampling. + "uncond" or "classifier" or "classifier-free". + condition: A pytorch tensor. The condition for the guided sampling. 
+ Only used for "classifier" or "classifier-free" guidance type. + unconditional_condition: A pytorch tensor. The condition for the unconditional sampling. + Only used for "classifier-free" guidance type. + guidance_scale: A `float`. The scale for the guided sampling. + classifier_fn: A classifier function. Only used for the classifier guidance. + classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function. + Returns: + A noise prediction model that accepts the noised data and the continuous time as the inputs. + """ + + def get_model_input_time(t_continuous): + """ + Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time. + For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N]. + For continuous-time DPMs, we just use `t_continuous`. + """ + if noise_schedule.schedule == 'discrete': + return (t_continuous - 1. / noise_schedule.total_N) * 1000. + else: + return t_continuous + + def noise_pred_fn(x, t_continuous, cond=None): + if t_continuous.reshape((-1,)).shape[0] == 1: + t_continuous = t_continuous.expand((x.shape[0])) + t_input = get_model_input_time(t_continuous) + output = model(x, t_input, **model_kwargs) + if model_type == "noise": + return output + elif model_type == "x_start": + alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims) + elif model_type == "v": + alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x + elif model_type == "score": + sigma_t = noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return -expand_dims(sigma_t, dims) * output + + def cond_grad_fn(x, t_input): + """ + Compute the gradient of the classifier, i.e. 
nabla_{x} log p_t(cond | x_t). + """ + with torch.enable_grad(): + x_in = x.detach().requires_grad_(True) + log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs) + return torch.autograd.grad(log_prob.sum(), x_in)[0] + + def model_fn(x, t_continuous): + """ + The noise predicition model function that is used for DPM-Solver. + """ + if t_continuous.reshape((-1,)).shape[0] == 1: + t_continuous = t_continuous.expand((x.shape[0])) + if guidance_type == "uncond": + return noise_pred_fn(x, t_continuous) + elif guidance_type == "classifier": + assert classifier_fn is not None + t_input = get_model_input_time(t_continuous) + cond_grad = cond_grad_fn(x, t_input) + sigma_t = noise_schedule.marginal_std(t_continuous) + noise = noise_pred_fn(x, t_continuous) + return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad + elif guidance_type == "classifier-free": + if guidance_scale == 1. or unconditional_condition is None: + return noise_pred_fn(x, t_continuous, cond=condition) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t_continuous] * 2) + c_in = torch.cat([unconditional_condition, condition]) + noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) + return noise_uncond + guidance_scale * (noise - noise_uncond) + + assert model_type in ["noise", "x_start", "v"] + assert guidance_type in ["uncond", "classifier", "classifier-free"] + return model_fn + + +class UniPC: + def __init__( + self, + model_fn, + noise_schedule, + predict_x0=True, + thresholding=False, + max_val=1., + variant='bh1', + noise_mask=None, + masked_image=None, + noise=None, + ): + """Construct a UniPC. + + We support both data_prediction and noise_prediction. 
+ """ + self.model = model_fn + self.noise_schedule = noise_schedule + self.variant = variant + self.predict_x0 = predict_x0 + self.thresholding = thresholding + self.max_val = max_val + self.noise_mask = noise_mask + self.masked_image = masked_image + self.noise = noise + + def dynamic_thresholding_fn(self, x0, t=None): + """ + The dynamic thresholding method. + """ + dims = x0.dim() + p = self.dynamic_thresholding_ratio + s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) + s = expand_dims(torch.maximum(s, self.thresholding_max_val * torch.ones_like(s).to(s.device)), dims) + x0 = torch.clamp(x0, -s, s) / s + return x0 + + def noise_prediction_fn(self, x, t): + """ + Return the noise prediction model. + """ + if self.noise_mask is not None: + return self.model(x, t) * self.noise_mask + else: + return self.model(x, t) + + def data_prediction_fn(self, x, t): + """ + Return the data prediction model (with thresholding). + """ + noise = self.noise_prediction_fn(x, t) + dims = x.dim() + alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) + x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims) + if self.thresholding: + p = 0.995 # A hyperparameter in the paper of "Imagen" [1]. + s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) + s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims) + x0 = torch.clamp(x0, -s, s) / s + if self.noise_mask is not None: + x0 = x0 * self.noise_mask + (1. - self.noise_mask) * self.masked_image + return x0 + + def model_fn(self, x, t): + """ + Convert the model to the noise prediction model or the data prediction model. + """ + if self.predict_x0: + return self.data_prediction_fn(x, t) + else: + return self.noise_prediction_fn(x, t) + + def get_time_steps(self, skip_type, t_T, t_0, N, device): + """Compute the intermediate time steps for sampling. 
+ """ + if skip_type == 'logSNR': + lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device)) + lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device)) + logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device) + return self.noise_schedule.inverse_lambda(logSNR_steps) + elif skip_type == 'time_uniform': + return torch.linspace(t_T, t_0, N + 1).to(device) + elif skip_type == 'time_quadratic': + t_order = 2 + t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device) + return t + else: + raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)) + + def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): + """ + Get the order of each step for sampling by the singlestep DPM-Solver. + """ + if order == 3: + K = steps // 3 + 1 + if steps % 3 == 0: + orders = [3,] * (K - 2) + [2, 1] + elif steps % 3 == 1: + orders = [3,] * (K - 1) + [1] + else: + orders = [3,] * (K - 1) + [2] + elif order == 2: + if steps % 2 == 0: + K = steps // 2 + orders = [2,] * K + else: + K = steps // 2 + 1 + orders = [2,] * (K - 1) + [1] + elif order == 1: + K = steps + orders = [1,] * steps + else: + raise ValueError("'order' must be '1' or '2' or '3'.") + if skip_type == 'logSNR': + # To reproduce the results in DPM-Solver paper + timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) + else: + timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[torch.cumsum(torch.tensor([0,] + orders), 0).to(device)] + return timesteps_outer, orders + + def denoise_to_zero_fn(self, x, s): + """ + Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization. 
+ """ + return self.data_prediction_fn(x, s) + + def multistep_uni_pc_update(self, x, model_prev_list, t_prev_list, t, order, **kwargs): + if len(t.shape) == 0: + t = t.view(-1) + if 'bh' in self.variant: + return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs) + else: + assert self.variant == 'vary_coeff' + return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs) + + def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True): + print(f'using unified predictor-corrector with order {order} (solver type: vary coeff)') + ns = self.noise_schedule + assert order <= len(model_prev_list) + + # first compute rks + t_prev_0 = t_prev_list[-1] + lambda_prev_0 = ns.marginal_lambda(t_prev_0) + lambda_t = ns.marginal_lambda(t) + model_prev_0 = model_prev_list[-1] + sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) + log_alpha_t = ns.marginal_log_mean_coeff(t) + alpha_t = torch.exp(log_alpha_t) + + h = lambda_t - lambda_prev_0 + + rks = [] + D1s = [] + for i in range(1, order): + t_prev_i = t_prev_list[-(i + 1)] + model_prev_i = model_prev_list[-(i + 1)] + lambda_prev_i = ns.marginal_lambda(t_prev_i) + rk = (lambda_prev_i - lambda_prev_0) / h + rks.append(rk) + D1s.append((model_prev_i - model_prev_0) / rk) + + rks.append(1.) 
+ rks = torch.tensor(rks, device=x.device) + + K = len(rks) + # build C matrix + C = [] + + col = torch.ones_like(rks) + for k in range(1, K + 1): + C.append(col) + col = col * rks / (k + 1) + C = torch.stack(C, dim=1) + + if len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) # (B, K) + C_inv_p = torch.linalg.inv(C[:-1, :-1]) + A_p = C_inv_p + + if use_corrector: + print('using corrector') + C_inv = torch.linalg.inv(C) + A_c = C_inv + + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) + h_phi_ks = [] + factorial_k = 1 + h_phi_k = h_phi_1 + for k in range(1, K + 2): + h_phi_ks.append(h_phi_k) + h_phi_k = h_phi_k / hh - 1 / factorial_k + factorial_k *= (k + 1) + + model_t = None + if self.predict_x0: + x_t_ = ( + sigma_t / sigma_prev_0 * x + - alpha_t * h_phi_1 * model_prev_0 + ) + # now predictor + x_t = x_t_ + if len(D1s) > 0: + # compute the residuals for predictor + for k in range(K - 1): + x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k]) + # now corrector + if use_corrector: + model_t = self.model_fn(x_t, t) + D1_t = (model_t - model_prev_0) + x_t = x_t_ + k = 0 + for k in range(K - 1): + x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1]) + x_t = x_t - alpha_t * h_phi_ks[K] * (D1_t * A_c[k][-1]) + else: + log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) + x_t_ = ( + (torch.exp(log_alpha_t - log_alpha_prev_0)) * x + - (sigma_t * h_phi_1) * model_prev_0 + ) + # now predictor + x_t = x_t_ + if len(D1s) > 0: + # compute the residuals for predictor + for k in range(K - 1): + x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k]) + # now corrector + if use_corrector: + model_t = self.model_fn(x_t, t) + D1_t = (model_t - model_prev_0) + x_t = x_t_ + k = 0 + for k in range(K - 1): + x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1]) + x_t = x_t - sigma_t * h_phi_ks[K] * (D1_t * 
def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
    """
    Perform one multistep UniPC predictor/corrector update with the B(h) variants.

    Reads `self.noise_schedule`, `self.predict_x0`, `self.variant` and
    `self.model_fn`.

    Args:
        x: current sample; only its rank (`x.dim()`) and device are used besides arithmetic.
        model_prev_list: history of model outputs, most recent last.
        t_prev_list: times matching `model_prev_list`, most recent last.
        t: target time; the step size is indexed with `[0]`, so it must be at least 1-D.
        order: number of history entries to use; must not exceed `len(model_prev_list)`.
        x_t: optional precomputed prediction; when given, the predictor is skipped.
        use_corrector: when true, evaluate the model at `x_t` and apply the corrector.
    Returns:
        `(x_t, model_t)`; `model_t` is `None` when the corrector did not run.
    Raises:
        NotImplementedError: if `self.variant` is neither 'bh1' nor 'bh2'.
    """
    ns = self.noise_schedule
    assert order <= len(model_prev_list)
    dims = x.dim()

    # Schedule quantities at the most recent history point and at the target time.
    t_prev_0 = t_prev_list[-1]
    model_prev_0 = model_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    h = lambda_t - lambda_prev_0

    # rks: step ratios of older history points relative to h.
    # D1s: scaled differences of older model outputs against the latest one.
    rks = []
    D1s = []
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
        rks.append(rk)
        D1s.append((model_prev_i - model_prev_0) / rk)

    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    hh = -h[0] if self.predict_x0 else h[0]
    h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
    h_phi_k = h_phi_1 / hh - 1

    if self.variant == 'bh1':
        B_h = hh
    elif self.variant == 'bh2':
        B_h = torch.expm1(hh)
    else:
        raise NotImplementedError()

    # Linear system R @ rho = b whose solution gives the combination weights.
    R = []
    b = []
    factorial_i = 1
    for i in range(1, order + 1):
        R.append(torch.pow(rks, i - 1))
        b.append(h_phi_k * factorial_i / B_h)
        factorial_i *= (i + 1)
        h_phi_k = h_phi_k / hh - 1 / factorial_i

    R = torch.stack(R)
    b = torch.tensor(b, device=x.device)

    # Predictor weights are only needed when there is history and no x_t was supplied.
    use_predictor = len(D1s) > 0 and x_t is None
    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1)  # (B, K, ...)
        if x_t is None:
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], device=b.device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
    else:
        D1s = None

    if use_corrector:
        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], device=b.device)
        else:
            rhos_c = torch.linalg.solve(R, b)

    # The data-prediction and noise-prediction forms differ only in the
    # first-order term and in the factor applied to the higher-order residual.
    if self.predict_x0:
        x_t_ = (
            expand_dims(sigma_t / sigma_prev_0, dims) * x
            - expand_dims(alpha_t * h_phi_1, dims) * model_prev_0
        )
        residual_scale = expand_dims(alpha_t * B_h, dims)
    else:
        x_t_ = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
        )
        residual_scale = expand_dims(sigma_t * B_h, dims)

    model_t = None
    if x_t is None:
        if use_predictor:
            # Ellipsis keeps this valid for any number of trailing latent
            # dimensions, not just (C, H, W).
            pred_res = torch.einsum('k,bk...->b...', rhos_p, D1s)
        else:
            pred_res = 0
        x_t = x_t_ - residual_scale * pred_res

    if use_corrector:
        model_t = self.model_fn(x_t, t)
        if D1s is not None:
            corr_res = torch.einsum('k,bk...->b...', rhos_c[:-1], D1s)
        else:
            corr_res = 0
        D1_t = (model_t - model_prev_0)
        x_t = x_t_ - residual_scale * (corr_res + rhos_c[-1] * D1_t)
    return x_t, model_t
def interpolate_fn(x, xp, yp):
    """
    Evaluate the piecewise linear function through keypoints (xp, yp) at x.

    Built only from differentiable tensor ops. Queries outside the keypoint
    range are extrapolated along the outermost segment.

    Args:
        x: PyTorch tensor with shape [N, C] (N queries per channel).
        xp: PyTorch tensor with shape [C, K], the keypoint abscissas.
        yp: PyTorch tensor with shape [C, K], the keypoint ordinates.
    Returns:
        The interpolated values with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    device = x.device

    # Put each query at index 0 in front of its channel's keypoints and sort.
    # argmin over the permutation then tells where index 0 (the query) landed.
    merged = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((batch, 1, 1))], dim=2)
    merged_sorted, permutation = torch.sort(merged, dim=2)
    rank = torch.argmin(permutation, dim=2)
    below = rank - 1

    left_of_all = torch.eq(rank, 0)
    right_of_all = torch.eq(rank, num_keys)
    # Segment start for interior queries, clamped to the last segment on the right.
    interior = torch.where(right_of_all, torch.tensor(num_keys - 2, device=device), below)

    # Indices into the merged (query + keypoints) sorted array; the query's own
    # slot is skipped when picking the upper neighbour.
    lo_sorted = torch.where(left_of_all, torch.tensor(1, device=device), interior)
    hi_sorted = torch.where(torch.eq(lo_sorted, below), lo_sorted + 2, lo_sorted + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=lo_sorted.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=hi_sorted.unsqueeze(2)).squeeze(2)

    # Matching indices into the keypoint arrays themselves.
    lo_key = torch.where(left_of_all, torch.tensor(0, device=device), interior)
    yp_batched = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(yp_batched, dim=2, index=lo_key.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(yp_batched, dim=2, index=(lo_key + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)
def sample_unipc(model, noise, image, sigmas, max_denoise, extra_args=None, callback=None, disable=False, noise_mask=None, variant='bh1'):
    """
    Run the UniPC multistep sampler over a sigma schedule.

    Args:
        model: denoiser, called through `predict_eps_sigma` as `model(input, sigma, **kwargs)`.
        noise: noise tensor; used as the start sample when `image` is None,
            otherwise added to the scaled image.
        image: optional starting image, also passed on as `masked_image`.
        sigmas: 1-D schedule used directly as the solver timesteps. It is not
            modified; a trailing 0 is replaced by 0.001 on a private copy.
        max_denoise: when true the noise is added at full strength (1.0)
            instead of `marginal_std` of the first timestep.
        extra_args: forwarded as `model_kwargs` to `model_wrapper`.
        callback: forwarded to `UniPC.sample`.
        disable: forwarded as `disable_pbar`.
        noise_mask: forwarded to `UniPC`.
        variant: UniPC variant name forwarded to `UniPC`.
    Returns:
        The sampler output divided by `marginal_alpha` of the last timestep.
    """
    # Always work on a copy. Basic slicing (`sigmas[:]`) returns a view, so
    # clamping the last entry through it would also overwrite the caller's schedule.
    timesteps = sigmas.clone()
    if timesteps[-1] == 0:
        timesteps[-1] = 0.001
    ns = SigmaConvert()

    if image is not None:
        if max_denoise:
            noise_mult = 1.0
        else:
            noise_mult = ns.marginal_std(timesteps[0])
        img = image * ns.marginal_alpha(timesteps[0]) + noise * noise_mult
    else:
        img = noise

    model_type = "noise"

    model_fn = model_wrapper(
        lambda input, sigma, **kwargs: predict_eps_sigma(model, input, sigma, **kwargs),
        ns,
        model_type=model_type,
        guidance_type="uncond",
        model_kwargs=extra_args,
    )

    # NOTE(review): for schedules with fewer than 3 entries this order is <= 0 — confirm callers never pass those.
    order = min(3, len(timesteps) - 2)
    uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, noise_mask=noise_mask, masked_image=image, noise=noise, variant=variant)
    x = uni_pc.sample(img, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable)
    # Out-of-place division: if the sampler hands back a tensor that still
    # aliases `noise` (img is `noise` when no image is given), an in-place
    # `/=` would rescale the caller's tensor.
    x = x / ns.marginal_alpha(timesteps[-1])
    return x
def get_previewer(device, latent_format):
    """
    Build the latent previewer selected by `args.preview_option`.

    Returns None when previews are disabled or when no previewer can be
    built. With the Auto option, TAESD is used if a matching decoder file is
    found under "vae_approx", otherwise Latent2RGB. If no TAESD previewer was
    created, a `Latent2RGBPreviewer` is used whenever
    `latent_format.latent_rgb_factors` is set.
    """
    method = args.preview_option
    if method == LatentPreviewMethod.NoPreviews:
        return None

    # TODO previewer methods
    # First "vae_approx" file whose name starts with the decoder name,
    # resolved to a full path.
    decoder_path = None
    if latent_format.taesd_decoder_name is not None:
        first_match = ""
        for candidate in ldm_patched.utils.path_utils.get_filename_list("vae_approx"):
            if candidate.startswith(latent_format.taesd_decoder_name):
                first_match = candidate
                break
        decoder_path = ldm_patched.utils.path_utils.get_full_path("vae_approx", first_match)

    if method == LatentPreviewMethod.Auto:
        method = LatentPreviewMethod.TAESD if decoder_path else LatentPreviewMethod.Latent2RGB

    if method == LatentPreviewMethod.TAESD:
        if decoder_path:
            return TAESDPreviewerImpl(TAESD(None, decoder_path).to(device))
        print("Warning: TAESD previews enabled, but could not find models/vae_approx/{}".format(latent_format.taesd_decoder_name))

    # Fallback for every remaining case, including a missing TAESD decoder.
    if latent_format.latent_rgb_factors is not None:
        return Latent2RGBPreviewer(latent_format.latent_rgb_factors)
    return None
ldm_patched.modules.utils.ProgressBar(steps) + def callback(step, x0, x, total_steps): + if x0_output_dict is not None: + x0_output_dict["x0"] = x0 + + preview_bytes = None + if previewer: + preview_bytes = previewer.decode_latent_to_preview_image(preview_format, x0) + pbar.update_absolute(step + 1, total_steps, preview_bytes) + return callback + diff --git a/ldm_patched/utils/path_utils.py b/ldm_patched/utils/path_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6cae149b47c5efb959cdcb815aafb28f37307b9c --- /dev/null +++ b/ldm_patched/utils/path_utils.py @@ -0,0 +1,262 @@ +import os +import time + +supported_pt_extensions = set(['.ckpt', '.pt', '.bin', '.pth', '.safetensors']) + +folder_names_and_paths = {} + +base_path = os.getcwd() +models_dir = os.path.join(base_path, "models") +folder_names_and_paths["checkpoints"] = ([os.path.join(models_dir, "checkpoints")], supported_pt_extensions) +folder_names_and_paths["configs"] = ([os.path.join(models_dir, "configs")], [".yaml"]) + +folder_names_and_paths["loras"] = ([os.path.join(models_dir, "loras")], supported_pt_extensions) +folder_names_and_paths["vae"] = ([os.path.join(models_dir, "vae")], supported_pt_extensions) +folder_names_and_paths["clip"] = ([os.path.join(models_dir, "clip")], supported_pt_extensions) +folder_names_and_paths["unet"] = ([os.path.join(models_dir, "unet")], supported_pt_extensions) +folder_names_and_paths["clip_vision"] = ([os.path.join(models_dir, "clip_vision")], supported_pt_extensions) +folder_names_and_paths["style_models"] = ([os.path.join(models_dir, "style_models")], supported_pt_extensions) +folder_names_and_paths["embeddings"] = ([os.path.join(models_dir, "embeddings")], supported_pt_extensions) +folder_names_and_paths["diffusers"] = ([os.path.join(models_dir, "diffusers")], ["folder"]) +folder_names_and_paths["vae_approx"] = ([os.path.join(models_dir, "vae_approx")], supported_pt_extensions) + +folder_names_and_paths["controlnet"] = 
#NOTE: used in http server so don't put folders that should not be accessed remotely
def get_directory_by_type(type_name):
    """Return the directory for "output", "temp" or "input"; None for any other name."""
    directory = None
    if type_name == "input":
        directory = get_input_directory()
    elif type_name == "temp":
        directory = get_temp_directory()
    elif type_name == "output":
        directory = get_output_directory()
    return directory
# Pick the base directory from a trailing annotation when the name has the
# form 'filename.ext [annotation]'; otherwise report no base directory.
def annotated_filepath(name):
    """
    Split a trailing "[output]", "[input]" or "[temp]" annotation off `name`.

    Returns:
        `(name, base_dir)`. With an annotation, `name` has the tag and one
        separating space (if present) removed, and `base_dir` is the matching
        directory. Without one, `name` is returned unchanged and `base_dir`
        is None.
    """
    if name.endswith("[output]"):
        base_dir = get_output_directory()
        tag_len = len("[output]")
    elif name.endswith("[input]"):
        base_dir = get_input_directory()
        tag_len = len("[input]")
    elif name.endswith("[temp]"):
        base_dir = get_temp_directory()
        tag_len = len("[temp]")
    else:
        return name, None

    name = name[:-tag_len]
    # Drop the single separating space only when it is actually there, so a
    # name written without it ("a.png[output]") keeps its last character.
    if name.endswith(" "):
        name = name[:-1]
    return name, base_dir
def filter_files_extensions(files, extensions):
    """
    Return the sorted entries of `files` whose extension is in `extensions`.

    The extension is the last `os.path.splitext` component, lower-cased before
    the membership test. An empty `extensions` collection disables filtering,
    so every file is returned (sorted).
    """
    # The "no filter" case does not depend on the file, so decide it once.
    if len(extensions) == 0:
        return sorted(files)
    return sorted(f for f in files if os.path.splitext(f)[-1].lower() in extensions)
def get_save_image_path(filename_prefix, output_dir, image_width=0, image_height=0):
    """
    Resolve where the next image for `filename_prefix` should be saved.

    "%width%" and "%height%" in the prefix are replaced by the given sizes.
    The prefix may contain a sub-folder; the target folder is created if
    missing. The counter is one more than the highest number found among
    existing files named "<filename>_<number>_...".

    Returns:
        `(full_output_folder, filename, counter, subfolder, filename_prefix)`.
    Raises:
        Exception: if the resolved folder is outside `output_dir`.
    """
    def substitute_size(text, width, height):
        text = text.replace("%width%", str(width))
        text = text.replace("%height%", str(height))
        return text

    filename_prefix = substitute_size(filename_prefix, image_width, image_height)

    normalized_prefix = os.path.normpath(filename_prefix)
    subfolder = os.path.dirname(normalized_prefix)
    filename = os.path.basename(normalized_prefix)

    full_output_folder = os.path.join(output_dir, subfolder)

    # Compare absolute, normalised paths: commonpath() returns a normalised
    # path, so the raw output_dir (trailing separator, relative path) would be
    # rejected or make commonpath() raise.
    output_root = os.path.abspath(output_dir)
    resolved_folder = os.path.abspath(full_output_folder)
    common = os.path.commonpath((output_root, resolved_folder))
    if common != output_root:
        err = "**** ERROR: Saving image outside the output folder is not allowed." + \
              "\n full_output_folder: " + resolved_folder + \
              "\n output_dir: " + output_dir + \
              "\n commonpath: " + common
        print(err)
        raise Exception(err)

    # Use the same normalised filename for the prefix length as for the comparison.
    prefix_len = len(filename)

    def parse_entry(entry):
        """Return (number, leading 'name_' part) for a directory entry; number is 0 if unparsable."""
        try:
            digits = int(entry[prefix_len + 1:].split('_')[0])
        except ValueError:
            digits = 0
        return digits, entry[:prefix_len + 1]

    try:
        existing = os.listdir(full_output_folder)
    except FileNotFoundError:
        os.makedirs(full_output_folder, exist_ok=True)
        existing = []

    expected_prefix = filename + "_"
    counter = max(
        (digits for digits, prefix in map(parse_entry, existing) if prefix == expected_prefix),
        default=0,
    ) + 1
    return full_output_folder, filename, counter, subfolder, filename_prefix
0000000000000000000000000000000000000000..438cd8d8901cca4baf0a25db1fd4e6a220871a65 Binary files /dev/null and b/modules/__pycache__/async_worker.cpython-312.pyc differ diff --git a/modules/__pycache__/auth.cpython-310.pyc b/modules/__pycache__/auth.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5c19abf6d9d71997584f97d728a6db91bee37605 Binary files /dev/null and b/modules/__pycache__/auth.cpython-310.pyc differ diff --git a/modules/__pycache__/config.cpython-310.pyc b/modules/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d402628ef5f76168b2e4e7c4c6ae63941c78c185 Binary files /dev/null and b/modules/__pycache__/config.cpython-310.pyc differ diff --git a/modules/__pycache__/config.cpython-312.pyc b/modules/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..960788c49ef15e7b023fcbcf698eeb8ae4e8f6fa Binary files /dev/null and b/modules/__pycache__/config.cpython-312.pyc differ diff --git a/modules/__pycache__/constants.cpython-310.pyc b/modules/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebd2df806cdfd9f3e16478be7b36a152c086881c Binary files /dev/null and b/modules/__pycache__/constants.cpython-310.pyc differ diff --git a/modules/__pycache__/core.cpython-310.pyc b/modules/__pycache__/core.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a30c98835502b0356d1ef4c1b8e3d13d7505cb86 Binary files /dev/null and b/modules/__pycache__/core.cpython-310.pyc differ diff --git a/modules/__pycache__/cv2win32.cpython-310.pyc b/modules/__pycache__/cv2win32.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..412fccf2850881203430b137145973d83be84649 Binary files /dev/null and b/modules/__pycache__/cv2win32.cpython-310.pyc differ diff --git a/modules/__pycache__/default_pipeline.cpython-310.pyc 
b/modules/__pycache__/default_pipeline.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b212fffe46ad1560f7d8d06783921ad81b311359 Binary files /dev/null and b/modules/__pycache__/default_pipeline.cpython-310.pyc differ diff --git a/modules/__pycache__/expansion.cpython-310.pyc b/modules/__pycache__/expansion.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..736cd379021b182cdaff95c6c3ed86572342d5fe Binary files /dev/null and b/modules/__pycache__/expansion.cpython-310.pyc differ diff --git a/modules/__pycache__/filters.cpython-310.pyc b/modules/__pycache__/filters.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae7fd411792cab7d5334074a56694cfe0946a0a5 Binary files /dev/null and b/modules/__pycache__/filters.cpython-310.pyc differ diff --git a/modules/__pycache__/flags.cpython-310.pyc b/modules/__pycache__/flags.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..62eacd194b9ec686a9d4256d227c4370c82668d4 Binary files /dev/null and b/modules/__pycache__/flags.cpython-310.pyc differ diff --git a/modules/__pycache__/flags.cpython-312.pyc b/modules/__pycache__/flags.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f06e1ba6a88967d9f4b32c16bed764ad259ff663 Binary files /dev/null and b/modules/__pycache__/flags.cpython-312.pyc differ diff --git a/modules/__pycache__/gradio_hijack.cpython-310.pyc b/modules/__pycache__/gradio_hijack.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec0e0bffcdab99082544ec5f1d278e87a6c0375b Binary files /dev/null and b/modules/__pycache__/gradio_hijack.cpython-310.pyc differ diff --git a/modules/__pycache__/html.cpython-310.pyc b/modules/__pycache__/html.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ee802bf574d66658ec47bbba56ca98a9dec84293 Binary files /dev/null and 
b/modules/__pycache__/html.cpython-310.pyc differ diff --git a/modules/__pycache__/html.cpython-312.pyc b/modules/__pycache__/html.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f9d9506bf08b11058af784dfcb546a028ebcf3a Binary files /dev/null and b/modules/__pycache__/html.cpython-312.pyc differ diff --git a/modules/__pycache__/inpaint_worker.cpython-310.pyc b/modules/__pycache__/inpaint_worker.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..85bfd8495da753191b425412e6a6668354ee9a9f Binary files /dev/null and b/modules/__pycache__/inpaint_worker.cpython-310.pyc differ diff --git a/modules/__pycache__/launch_util.cpython-310.pyc b/modules/__pycache__/launch_util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d77ad1d6e06b351e9d231250571f75a1354bfa23 Binary files /dev/null and b/modules/__pycache__/launch_util.cpython-310.pyc differ diff --git a/modules/__pycache__/launch_util.cpython-312.pyc b/modules/__pycache__/launch_util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5faab9b84b485947f9c72505579162eb5ac86a0 Binary files /dev/null and b/modules/__pycache__/launch_util.cpython-312.pyc differ diff --git a/modules/__pycache__/localization.cpython-310.pyc b/modules/__pycache__/localization.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50322b1e81cc765ff1b3e22f4f2c95b04420ae9c Binary files /dev/null and b/modules/__pycache__/localization.cpython-310.pyc differ diff --git a/modules/__pycache__/lora.cpython-310.pyc b/modules/__pycache__/lora.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0b63f290dcbb015e63c3872c92ff5cd89810ab52 Binary files /dev/null and b/modules/__pycache__/lora.cpython-310.pyc differ diff --git a/modules/__pycache__/meta_parser.cpython-310.pyc b/modules/__pycache__/meta_parser.cpython-310.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..179c6aa8f0db3ae9ce5af088d109636ffa92f189 Binary files /dev/null and b/modules/__pycache__/meta_parser.cpython-310.pyc differ diff --git a/modules/__pycache__/model_loader.cpython-310.pyc b/modules/__pycache__/model_loader.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55a674302d732059efa9e6667a67a6d405831e88 Binary files /dev/null and b/modules/__pycache__/model_loader.cpython-310.pyc differ diff --git a/modules/__pycache__/model_loader.cpython-312.pyc b/modules/__pycache__/model_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..802988ea2fdb33818c21dbd0d2593fc79173c123 Binary files /dev/null and b/modules/__pycache__/model_loader.cpython-312.pyc differ diff --git a/modules/__pycache__/ops.cpython-310.pyc b/modules/__pycache__/ops.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40fd7d2955432ae501ea2e7a5c02f28b69838a1d Binary files /dev/null and b/modules/__pycache__/ops.cpython-310.pyc differ diff --git a/modules/__pycache__/patch.cpython-310.pyc b/modules/__pycache__/patch.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..acb1b1baaad8106c9672dc1f65b8632a769eb4f7 Binary files /dev/null and b/modules/__pycache__/patch.cpython-310.pyc differ diff --git a/modules/__pycache__/patch.cpython-312.pyc b/modules/__pycache__/patch.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ccedcb888de32f4cc15e792a98c96a0913210fa Binary files /dev/null and b/modules/__pycache__/patch.cpython-312.pyc differ diff --git a/modules/__pycache__/patch_clip.cpython-310.pyc b/modules/__pycache__/patch_clip.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..00f662bca01f5a747607e1aa234a3ee6667312d9 Binary files /dev/null and b/modules/__pycache__/patch_clip.cpython-310.pyc differ diff --git 
a/modules/__pycache__/patch_precision.cpython-310.pyc b/modules/__pycache__/patch_precision.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..56ef76fd17b7c3425ec37fd666eca3ac156323b9 Binary files /dev/null and b/modules/__pycache__/patch_precision.cpython-310.pyc differ diff --git a/modules/__pycache__/path.cpython-310.pyc b/modules/__pycache__/path.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ddcaa406185794df163c12c8be18ad212e078c1 Binary files /dev/null and b/modules/__pycache__/path.cpython-310.pyc differ diff --git a/modules/__pycache__/private_logger.cpython-310.pyc b/modules/__pycache__/private_logger.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..518e53c234dcad3d0daec96815ad5ead376b4ff1 Binary files /dev/null and b/modules/__pycache__/private_logger.cpython-310.pyc differ diff --git a/modules/__pycache__/sample_hijack.cpython-310.pyc b/modules/__pycache__/sample_hijack.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..77a465b53cfe3d652373bc0433ca065ebe988644 Binary files /dev/null and b/modules/__pycache__/sample_hijack.cpython-310.pyc differ diff --git a/modules/__pycache__/samplers_advanced.cpython-310.pyc b/modules/__pycache__/samplers_advanced.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a140b6406573486fc6853b6c9cadd23b8b61a64d Binary files /dev/null and b/modules/__pycache__/samplers_advanced.cpython-310.pyc differ diff --git a/modules/__pycache__/sdxl_styles.cpython-310.pyc b/modules/__pycache__/sdxl_styles.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4db3f282d380ee917b635abd532fb755ffdc53d Binary files /dev/null and b/modules/__pycache__/sdxl_styles.cpython-310.pyc differ diff --git a/modules/__pycache__/sdxl_styles.cpython-312.pyc b/modules/__pycache__/sdxl_styles.cpython-312.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..0d94a07d1dc6fb724de5eae795af4b7ebeef87ea Binary files /dev/null and b/modules/__pycache__/sdxl_styles.cpython-312.pyc differ diff --git a/modules/__pycache__/style_sorter.cpython-310.pyc b/modules/__pycache__/style_sorter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..864c1b4b9ad35e6254e468fd9a4b8e600b6eb92c Binary files /dev/null and b/modules/__pycache__/style_sorter.cpython-310.pyc differ diff --git a/modules/__pycache__/ui_gradio_extensions.cpython-310.pyc b/modules/__pycache__/ui_gradio_extensions.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..daa1d07f8eed613518c94cbd6b7370e1cfb1c0bf Binary files /dev/null and b/modules/__pycache__/ui_gradio_extensions.cpython-310.pyc differ diff --git a/modules/__pycache__/upscaler.cpython-310.pyc b/modules/__pycache__/upscaler.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7498d2c4484ce4479fdb0da8572d240734c6e9d9 Binary files /dev/null and b/modules/__pycache__/upscaler.cpython-310.pyc differ diff --git a/modules/__pycache__/util.cpython-310.pyc b/modules/__pycache__/util.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3735a9a9d85028182c62918ac11ed2ca936a8e4a Binary files /dev/null and b/modules/__pycache__/util.cpython-310.pyc differ diff --git a/modules/__pycache__/util.cpython-312.pyc b/modules/__pycache__/util.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bda34a2330d433469a4c0fa88376116b04f33ae7 Binary files /dev/null and b/modules/__pycache__/util.cpython-312.pyc differ diff --git a/modules/anisotropic.py b/modules/anisotropic.py new file mode 100644 index 0000000000000000000000000000000000000000..576822240762b7dfcfb27e49364314ee1cb436d9 --- /dev/null +++ b/modules/anisotropic.py @@ -0,0 +1,200 @@ +import torch + + +Tensor = torch.Tensor +Device = torch.DeviceObjType +Dtype = torch.Type +pad 
= torch.nn.functional.pad + + +def _compute_zero_padding(kernel_size: tuple[int, int] | int) -> tuple[int, int]: + ky, kx = _unpack_2d_ks(kernel_size) + return (ky - 1) // 2, (kx - 1) // 2 + + +def _unpack_2d_ks(kernel_size: tuple[int, int] | int) -> tuple[int, int]: + if isinstance(kernel_size, int): + ky = kx = kernel_size + else: + assert len(kernel_size) == 2, '2D Kernel size should have a length of 2.' + ky, kx = kernel_size + + ky = int(ky) + kx = int(kx) + return ky, kx + + +def gaussian( + window_size: int, sigma: Tensor | float, *, device: Device | None = None, dtype: Dtype | None = None +) -> Tensor: + + batch_size = sigma.shape[0] + + x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1) + + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) + + return gauss / gauss.sum(-1, keepdim=True) + + +def get_gaussian_kernel1d( + kernel_size: int, + sigma: float | Tensor, + force_even: bool = False, + *, + device: Device | None = None, + dtype: Dtype | None = None, +) -> Tensor: + + return gaussian(kernel_size, sigma, device=device, dtype=dtype) + + +def get_gaussian_kernel2d( + kernel_size: tuple[int, int] | int, + sigma: tuple[float, float] | Tensor, + force_even: bool = False, + *, + device: Device | None = None, + dtype: Dtype | None = None, +) -> Tensor: + + sigma = torch.Tensor([[sigma, sigma]]).to(device=device, dtype=dtype) + + ksize_y, ksize_x = _unpack_2d_ks(kernel_size) + sigma_y, sigma_x = sigma[:, 0, None], sigma[:, 1, None] + + kernel_y = get_gaussian_kernel1d(ksize_y, sigma_y, force_even, device=device, dtype=dtype)[..., None] + kernel_x = get_gaussian_kernel1d(ksize_x, sigma_x, force_even, device=device, dtype=dtype)[..., None] + + return kernel_y * kernel_x.view(-1, 1, ksize_x) + + +def _bilateral_blur( + input: Tensor, + guidance: Tensor | None, + kernel_size: tuple[int, int] | int, + sigma_color: float | Tensor, + sigma_space: tuple[float, 
float] | Tensor, + border_type: str = 'reflect', + color_distance_type: str = 'l1', +) -> Tensor: + + if isinstance(sigma_color, Tensor): + sigma_color = sigma_color.to(device=input.device, dtype=input.dtype).view(-1, 1, 1, 1, 1) + + ky, kx = _unpack_2d_ks(kernel_size) + pad_y, pad_x = _compute_zero_padding(kernel_size) + + padded_input = pad(input, (pad_x, pad_x, pad_y, pad_y), mode=border_type) + unfolded_input = padded_input.unfold(2, ky, 1).unfold(3, kx, 1).flatten(-2) # (B, C, H, W, Ky x Kx) + + if guidance is None: + guidance = input + unfolded_guidance = unfolded_input + else: + padded_guidance = pad(guidance, (pad_x, pad_x, pad_y, pad_y), mode=border_type) + unfolded_guidance = padded_guidance.unfold(2, ky, 1).unfold(3, kx, 1).flatten(-2) # (B, C, H, W, Ky x Kx) + + diff = unfolded_guidance - guidance.unsqueeze(-1) + if color_distance_type == "l1": + color_distance_sq = diff.abs().sum(1, keepdim=True).square() + elif color_distance_type == "l2": + color_distance_sq = diff.square().sum(1, keepdim=True) + else: + raise ValueError("color_distance_type only acceps l1 or l2") + color_kernel = (-0.5 / sigma_color**2 * color_distance_sq).exp() # (B, 1, H, W, Ky x Kx) + + space_kernel = get_gaussian_kernel2d(kernel_size, sigma_space, device=input.device, dtype=input.dtype) + space_kernel = space_kernel.view(-1, 1, 1, 1, kx * ky) + + kernel = space_kernel * color_kernel + out = (unfolded_input * kernel).sum(-1) / kernel.sum(-1) + return out + + +def bilateral_blur( + input: Tensor, + kernel_size: tuple[int, int] | int = (13, 13), + sigma_color: float | Tensor = 3.0, + sigma_space: tuple[float, float] | Tensor = 3.0, + border_type: str = 'reflect', + color_distance_type: str = 'l1', +) -> Tensor: + return _bilateral_blur(input, None, kernel_size, sigma_color, sigma_space, border_type, color_distance_type) + + +def adaptive_anisotropic_filter(x, g=None): + if g is None: + g = x + s, m = torch.std_mean(g, dim=(1, 2, 3), keepdim=True) + s = s + 1e-5 + guidance = (g - m) 
/ s + y = _bilateral_blur(x, guidance, + kernel_size=(13, 13), + sigma_color=3.0, + sigma_space=3.0, + border_type='reflect', + color_distance_type='l1') + return y + + +def joint_bilateral_blur( + input: Tensor, + guidance: Tensor, + kernel_size: tuple[int, int] | int, + sigma_color: float | Tensor, + sigma_space: tuple[float, float] | Tensor, + border_type: str = 'reflect', + color_distance_type: str = 'l1', +) -> Tensor: + return _bilateral_blur(input, guidance, kernel_size, sigma_color, sigma_space, border_type, color_distance_type) + + +class _BilateralBlur(torch.nn.Module): + def __init__( + self, + kernel_size: tuple[int, int] | int, + sigma_color: float | Tensor, + sigma_space: tuple[float, float] | Tensor, + border_type: str = 'reflect', + color_distance_type: str = "l1", + ) -> None: + super().__init__() + self.kernel_size = kernel_size + self.sigma_color = sigma_color + self.sigma_space = sigma_space + self.border_type = border_type + self.color_distance_type = color_distance_type + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}" + f"(kernel_size={self.kernel_size}, " + f"sigma_color={self.sigma_color}, " + f"sigma_space={self.sigma_space}, " + f"border_type={self.border_type}, " + f"color_distance_type={self.color_distance_type})" + ) + + +class BilateralBlur(_BilateralBlur): + def forward(self, input: Tensor) -> Tensor: + return bilateral_blur( + input, self.kernel_size, self.sigma_color, self.sigma_space, self.border_type, self.color_distance_type + ) + + +class JointBilateralBlur(_BilateralBlur): + def forward(self, input: Tensor, guidance: Tensor) -> Tensor: + return joint_bilateral_blur( + input, + guidance, + self.kernel_size, + self.sigma_color, + self.sigma_space, + self.border_type, + self.color_distance_type, + ) diff --git a/modules/async_worker.py b/modules/async_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..83fc39123f169a3e96a7e0bcf4e9ca92f046f7d0 --- /dev/null +++ 
b/modules/async_worker.py @@ -0,0 +1,914 @@ +import threading +from modules.patch import PatchSettings, patch_settings, patch_all + +patch_all() + +class AsyncTask: + def __init__(self, args): + self.args = args + self.yields = [] + self.results = [] + self.last_stop = False + self.processing = False + + +async_tasks = [] + + +def worker(): + global async_tasks + + import os + import traceback + import math + import numpy as np + import cv2 + import torch + import time + import shared + import random + import copy + import modules.default_pipeline as pipeline + import modules.core as core + import modules.flags as flags + import modules.config + import modules.patch + import ldm_patched.modules.model_management + import extras.preprocessors as preprocessors + import modules.inpaint_worker as inpaint_worker + import modules.constants as constants + import extras.ip_adapter as ip_adapter + import extras.face_crop + import fooocus_version + import args_manager + + from modules.sdxl_styles import apply_style, apply_wildcards, fooocus_expansion, apply_arrays + from modules.private_logger import log + from extras.expansion import safe_str + from modules.util import remove_empty_str, HWC3, resize_image, \ + get_image_shape_ceil, set_image_shape_ceil, get_shape_ceil, resample_image, erode_or_dilate, ordinal_suffix + from modules.upscaler import perform_upscale + from modules.flags import Performance + from modules.meta_parser import get_metadata_parser, MetadataScheme + + pid = os.getpid() + print(f'Started worker with PID {pid}') + + try: + async_gradio_app = shared.gradio_root + flag = f'''App started successful. 
Use the app with {str(async_gradio_app.local_url)} or {str(async_gradio_app.server_name)}:{str(async_gradio_app.server_port)}''' + if async_gradio_app.share: + flag += f''' or {async_gradio_app.share_url}''' + print(flag) + except Exception as e: + print(e) + + def progressbar(async_task, number, text): + print(f'[Fooocus] {text}') + async_task.yields.append(['preview', (number, text, None)]) + + def yield_result(async_task, imgs, do_not_show_finished_images=False): + if not isinstance(imgs, list): + imgs = [imgs] + + async_task.results = async_task.results + imgs + + if do_not_show_finished_images: + return + + async_task.yields.append(['results', async_task.results]) + return + + def build_image_wall(async_task): + results = [] + + if len(async_task.results) < 2: + return + + for img in async_task.results: + if isinstance(img, str) and os.path.exists(img): + img = cv2.imread(img) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if not isinstance(img, np.ndarray): + return + if img.ndim != 3: + return + results.append(img) + + H, W, C = results[0].shape + + for img in results: + Hn, Wn, Cn = img.shape + if H != Hn: + return + if W != Wn: + return + if C != Cn: + return + + cols = float(len(results)) ** 0.5 + cols = int(math.ceil(cols)) + rows = float(len(results)) / float(cols) + rows = int(math.ceil(rows)) + + wall = np.zeros(shape=(H * rows, W * cols, C), dtype=np.uint8) + + for y in range(rows): + for x in range(cols): + if y * cols + x < len(results): + img = results[y * cols + x] + wall[y * H:y * H + H, x * W:x * W + W, :] = img + + # must use deep copy otherwise gradio is super laggy. Do not use list.append() . 
+ async_task.results = async_task.results + [wall] + return + + def apply_enabled_loras(loras): + enabled_loras = [] + for lora_enabled, lora_model, lora_weight in loras: + if lora_enabled: + enabled_loras.append([lora_model, lora_weight]) + + return enabled_loras + + @torch.no_grad() + @torch.inference_mode() + def handler(async_task): + execution_start_time = time.perf_counter() + async_task.processing = True + + args = async_task.args + args.reverse() + + prompt = args.pop() + negative_prompt = args.pop() + style_selections = args.pop() + performance_selection = Performance(args.pop()) + aspect_ratios_selection = args.pop() + image_number = args.pop() + output_format = args.pop() + image_seed = args.pop() + sharpness = args.pop() + guidance_scale = args.pop() + base_model_name = args.pop() + refiner_model_name = args.pop() + refiner_switch = args.pop() + loras = apply_enabled_loras([[bool(args.pop()), str(args.pop()), float(args.pop()), ] for _ in range(modules.config.default_max_lora_number)]) + input_image_checkbox = args.pop() + current_tab = args.pop() + uov_method = args.pop() + uov_input_image = args.pop() + outpaint_selections = args.pop() + inpaint_input_image = args.pop() + inpaint_additional_prompt = args.pop() + inpaint_mask_image_upload = args.pop() + + disable_preview = args.pop() + disable_intermediate_results = args.pop() + disable_seed_increment = args.pop() + adm_scaler_positive = args.pop() + adm_scaler_negative = args.pop() + adm_scaler_end = args.pop() + adaptive_cfg = args.pop() + sampler_name = args.pop() + scheduler_name = args.pop() + overwrite_step = args.pop() + overwrite_switch = args.pop() + overwrite_width = args.pop() + overwrite_height = args.pop() + overwrite_vary_strength = args.pop() + overwrite_upscale_strength = args.pop() + mixing_image_prompt_and_vary_upscale = args.pop() + mixing_image_prompt_and_inpaint = args.pop() + debugging_cn_preprocessor = args.pop() + skipping_cn_preprocessor = args.pop() + canny_low_threshold = 
args.pop() + canny_high_threshold = args.pop() + refiner_swap_method = args.pop() + controlnet_softness = args.pop() + freeu_enabled = args.pop() + freeu_b1 = args.pop() + freeu_b2 = args.pop() + freeu_s1 = args.pop() + freeu_s2 = args.pop() + debugging_inpaint_preprocessor = args.pop() + inpaint_disable_initial_latent = args.pop() + inpaint_engine = args.pop() + inpaint_strength = args.pop() + inpaint_respective_field = args.pop() + inpaint_mask_upload_checkbox = args.pop() + invert_mask_checkbox = args.pop() + inpaint_erode_or_dilate = args.pop() + + save_metadata_to_images = args.pop() if not args_manager.args.disable_metadata else False + metadata_scheme = MetadataScheme(args.pop()) if not args_manager.args.disable_metadata else MetadataScheme.FOOOCUS + + cn_tasks = {x: [] for x in flags.ip_list} + for _ in range(flags.controlnet_image_count): + cn_img = args.pop() + cn_stop = args.pop() + cn_weight = args.pop() + cn_type = args.pop() + if cn_img is not None: + cn_tasks[cn_type].append([cn_img, cn_stop, cn_weight]) + + outpaint_selections = [o.lower() for o in outpaint_selections] + base_model_additional_loras = [] + raw_style_selections = copy.deepcopy(style_selections) + uov_method = uov_method.lower() + + if fooocus_expansion in style_selections: + use_expansion = True + style_selections.remove(fooocus_expansion) + else: + use_expansion = False + + use_style = len(style_selections) > 0 + + if base_model_name == refiner_model_name: + print(f'Refiner disabled because base model and refiner are same.') + refiner_model_name = 'None' + + steps = performance_selection.steps() + + if performance_selection == Performance.EXTREME_SPEED: + print('Enter LCM mode.') + progressbar(async_task, 1, 'Downloading LCM components ...') + loras += [(modules.config.downloading_sdxl_lcm_lora(), 1.0)] + + if refiner_model_name != 'None': + print(f'Refiner disabled in LCM mode.') + + refiner_model_name = 'None' + sampler_name = 'lcm' + scheduler_name = 'lcm' + sharpness = 0.0 + 
guidance_scale = 1.0 + adaptive_cfg = 1.0 + refiner_switch = 1.0 + adm_scaler_positive = 1.0 + adm_scaler_negative = 1.0 + adm_scaler_end = 0.0 + + print(f'[Parameters] Adaptive CFG = {adaptive_cfg}') + print(f'[Parameters] Sharpness = {sharpness}') + print(f'[Parameters] ControlNet Softness = {controlnet_softness}') + print(f'[Parameters] ADM Scale = ' + f'{adm_scaler_positive} : ' + f'{adm_scaler_negative} : ' + f'{adm_scaler_end}') + + patch_settings[pid] = PatchSettings( + sharpness, + adm_scaler_end, + adm_scaler_positive, + adm_scaler_negative, + controlnet_softness, + adaptive_cfg + ) + + cfg_scale = float(guidance_scale) + print(f'[Parameters] CFG = {cfg_scale}') + + initial_latent = None + denoising_strength = 1.0 + tiled = False + + width, height = aspect_ratios_selection.replace('×', ' ').split(' ')[:2] + width, height = int(width), int(height) + + skip_prompt_processing = False + + inpaint_worker.current_task = None + inpaint_parameterized = inpaint_engine != 'None' + inpaint_image = None + inpaint_mask = None + inpaint_head_model_path = None + + use_synthetic_refiner = False + + controlnet_canny_path = None + controlnet_cpds_path = None + clip_vision_path, ip_negative_path, ip_adapter_path, ip_adapter_face_path = None, None, None, None + + seed = int(image_seed) + print(f'[Parameters] Seed = {seed}') + + goals = [] + tasks = [] + + if input_image_checkbox: + if (current_tab == 'uov' or ( + current_tab == 'ip' and mixing_image_prompt_and_vary_upscale)) \ + and uov_method != flags.disabled and uov_input_image is not None: + uov_input_image = HWC3(uov_input_image) + if 'vary' in uov_method: + goals.append('vary') + elif 'upscale' in uov_method: + goals.append('upscale') + if 'fast' in uov_method: + skip_prompt_processing = True + else: + steps = performance_selection.steps_uov() + + progressbar(async_task, 1, 'Downloading upscale models ...') + modules.config.downloading_upscale_model() + if (current_tab == 'inpaint' or ( + current_tab == 'ip' and 
mixing_image_prompt_and_inpaint)) \ + and isinstance(inpaint_input_image, dict): + inpaint_image = inpaint_input_image['image'] + inpaint_mask = inpaint_input_image['mask'][:, :, 0] + + if inpaint_mask_upload_checkbox: + if isinstance(inpaint_mask_image_upload, np.ndarray): + if inpaint_mask_image_upload.ndim == 3: + H, W, C = inpaint_image.shape + inpaint_mask_image_upload = resample_image(inpaint_mask_image_upload, width=W, height=H) + inpaint_mask_image_upload = np.mean(inpaint_mask_image_upload, axis=2) + inpaint_mask_image_upload = (inpaint_mask_image_upload > 127).astype(np.uint8) * 255 + inpaint_mask = np.maximum(inpaint_mask, inpaint_mask_image_upload) + + if int(inpaint_erode_or_dilate) != 0: + inpaint_mask = erode_or_dilate(inpaint_mask, inpaint_erode_or_dilate) + + if invert_mask_checkbox: + inpaint_mask = 255 - inpaint_mask + + inpaint_image = HWC3(inpaint_image) + if isinstance(inpaint_image, np.ndarray) and isinstance(inpaint_mask, np.ndarray) \ + and (np.any(inpaint_mask > 127) or len(outpaint_selections) > 0): + progressbar(async_task, 1, 'Downloading upscale models ...') + modules.config.downloading_upscale_model() + if inpaint_parameterized: + progressbar(async_task, 1, 'Downloading inpainter ...') + inpaint_head_model_path, inpaint_patch_model_path = modules.config.downloading_inpaint_models( + inpaint_engine) + base_model_additional_loras += [(inpaint_patch_model_path, 1.0)] + print(f'[Inpaint] Current inpaint model is {inpaint_patch_model_path}') + if refiner_model_name == 'None': + use_synthetic_refiner = True + refiner_switch = 0.5 + else: + inpaint_head_model_path, inpaint_patch_model_path = None, None + print(f'[Inpaint] Parameterized inpaint is disabled.') + if inpaint_additional_prompt != '': + if prompt == '': + prompt = inpaint_additional_prompt + else: + prompt = inpaint_additional_prompt + '\n' + prompt + goals.append('inpaint') + if current_tab == 'ip' or \ + mixing_image_prompt_and_vary_upscale or \ + 
mixing_image_prompt_and_inpaint: + goals.append('cn') + progressbar(async_task, 1, 'Downloading control models ...') + if len(cn_tasks[flags.cn_canny]) > 0: + controlnet_canny_path = modules.config.downloading_controlnet_canny() + if len(cn_tasks[flags.cn_cpds]) > 0: + controlnet_cpds_path = modules.config.downloading_controlnet_cpds() + if len(cn_tasks[flags.cn_ip]) > 0: + clip_vision_path, ip_negative_path, ip_adapter_path = modules.config.downloading_ip_adapters('ip') + if len(cn_tasks[flags.cn_ip_face]) > 0: + clip_vision_path, ip_negative_path, ip_adapter_face_path = modules.config.downloading_ip_adapters( + 'face') + progressbar(async_task, 1, 'Loading control models ...') + + # Load or unload CNs + pipeline.refresh_controlnets([controlnet_canny_path, controlnet_cpds_path]) + ip_adapter.load_ip_adapter(clip_vision_path, ip_negative_path, ip_adapter_path) + ip_adapter.load_ip_adapter(clip_vision_path, ip_negative_path, ip_adapter_face_path) + + if overwrite_step > 0: + steps = overwrite_step + + switch = int(round(steps * refiner_switch)) + + if overwrite_switch > 0: + switch = overwrite_switch + + if overwrite_width > 0: + width = overwrite_width + + if overwrite_height > 0: + height = overwrite_height + + print(f'[Parameters] Sampler = {sampler_name} - {scheduler_name}') + print(f'[Parameters] Steps = {steps} - {switch}') + + progressbar(async_task, 1, 'Initializing ...') + + if not skip_prompt_processing: + + prompts = remove_empty_str([safe_str(p) for p in prompt.splitlines()], default='') + negative_prompts = remove_empty_str([safe_str(p) for p in negative_prompt.splitlines()], default='') + + prompt = prompts[0] + negative_prompt = negative_prompts[0] + + if prompt == '': + # disable expansion when empty since it is not meaningful and influences image prompt + use_expansion = False + + extra_positive_prompts = prompts[1:] if len(prompts) > 1 else [] + extra_negative_prompts = negative_prompts[1:] if len(negative_prompts) > 1 else [] + + 
progressbar(async_task, 3, 'Loading models ...') + pipeline.refresh_everything(refiner_model_name=refiner_model_name, base_model_name=base_model_name, + loras=loras, base_model_additional_loras=base_model_additional_loras, + use_synthetic_refiner=use_synthetic_refiner) + + progressbar(async_task, 3, 'Processing prompts ...') + tasks = [] + + for i in range(image_number): + if disable_seed_increment: + task_seed = seed + else: + task_seed = (seed + i) % (constants.MAX_SEED + 1) # randint is inclusive, % is not + + task_rng = random.Random(task_seed) # may bind to inpaint noise in the future + task_prompt = apply_wildcards(prompt, task_rng) + task_prompt = apply_arrays(task_prompt, i) + task_negative_prompt = apply_wildcards(negative_prompt, task_rng) + task_extra_positive_prompts = [apply_wildcards(pmt, task_rng) for pmt in extra_positive_prompts] + task_extra_negative_prompts = [apply_wildcards(pmt, task_rng) for pmt in extra_negative_prompts] + + positive_basic_workloads = [] + negative_basic_workloads = [] + + if use_style: + for s in style_selections: + p, n = apply_style(s, positive=task_prompt) + positive_basic_workloads = positive_basic_workloads + p + negative_basic_workloads = negative_basic_workloads + n + else: + positive_basic_workloads.append(task_prompt) + + negative_basic_workloads.append(task_negative_prompt) # Always use independent workload for negative. 
+ + positive_basic_workloads = positive_basic_workloads + task_extra_positive_prompts + negative_basic_workloads = negative_basic_workloads + task_extra_negative_prompts + + positive_basic_workloads = remove_empty_str(positive_basic_workloads, default=task_prompt) + negative_basic_workloads = remove_empty_str(negative_basic_workloads, default=task_negative_prompt) + + tasks.append(dict( + task_seed=task_seed, + task_prompt=task_prompt, + task_negative_prompt=task_negative_prompt, + positive=positive_basic_workloads, + negative=negative_basic_workloads, + expansion='', + c=None, + uc=None, + positive_top_k=len(positive_basic_workloads), + negative_top_k=len(negative_basic_workloads), + log_positive_prompt='\n'.join([task_prompt] + task_extra_positive_prompts), + log_negative_prompt='\n'.join([task_negative_prompt] + task_extra_negative_prompts), + )) + + if use_expansion: + for i, t in enumerate(tasks): + progressbar(async_task, 5, f'Preparing Fooocus text #{i + 1} ...') + expansion = pipeline.final_expansion(t['task_prompt'], t['task_seed']) + print(f'[Prompt Expansion] {expansion}') + t['expansion'] = expansion + t['positive'] = copy.deepcopy(t['positive']) + [expansion] # Deep copy. 
+ + for i, t in enumerate(tasks): + progressbar(async_task, 7, f'Encoding positive #{i + 1} ...') + t['c'] = pipeline.clip_encode(texts=t['positive'], pool_top_k=t['positive_top_k']) + + for i, t in enumerate(tasks): + if abs(float(cfg_scale) - 1.0) < 1e-4: + t['uc'] = pipeline.clone_cond(t['c']) + else: + progressbar(async_task, 10, f'Encoding negative #{i + 1} ...') + t['uc'] = pipeline.clip_encode(texts=t['negative'], pool_top_k=t['negative_top_k']) + + if len(goals) > 0: + progressbar(async_task, 13, 'Image processing ...') + + if 'vary' in goals: + if 'subtle' in uov_method: + denoising_strength = 0.5 + if 'strong' in uov_method: + denoising_strength = 0.85 + if overwrite_vary_strength > 0: + denoising_strength = overwrite_vary_strength + + shape_ceil = get_image_shape_ceil(uov_input_image) + if shape_ceil < 1024: + print(f'[Vary] Image is resized because it is too small.') + shape_ceil = 1024 + elif shape_ceil > 2048: + print(f'[Vary] Image is resized because it is too big.') + shape_ceil = 2048 + + uov_input_image = set_image_shape_ceil(uov_input_image, shape_ceil) + + initial_pixels = core.numpy_to_pytorch(uov_input_image) + progressbar(async_task, 13, 'VAE encoding ...') + + candidate_vae, _ = pipeline.get_candidate_vae( + steps=steps, + switch=switch, + denoise=denoising_strength, + refiner_swap_method=refiner_swap_method + ) + + initial_latent = core.encode_vae(vae=candidate_vae, pixels=initial_pixels) + B, C, H, W = initial_latent['samples'].shape + width = W * 8 + height = H * 8 + print(f'Final resolution is {str((height, width))}.') + + if 'upscale' in goals: + H, W, C = uov_input_image.shape + progressbar(async_task, 13, f'Upscaling image from {str((H, W))} ...') + uov_input_image = perform_upscale(uov_input_image) + print(f'Image upscaled.') + + if '1.5x' in uov_method: + f = 1.5 + elif '2x' in uov_method: + f = 2.0 + else: + f = 1.0 + + shape_ceil = get_shape_ceil(H * f, W * f) + + if shape_ceil < 1024: + print(f'[Upscale] Image is resized because 
it is too small.') + uov_input_image = set_image_shape_ceil(uov_input_image, 1024) + shape_ceil = 1024 + else: + uov_input_image = resample_image(uov_input_image, width=W * f, height=H * f) + + image_is_super_large = shape_ceil > 2800 + + if 'fast' in uov_method: + direct_return = True + elif image_is_super_large: + print('Image is too large. Directly returned the SR image. ' + 'Usually directly return SR image at 4K resolution ' + 'yields better results than SDXL diffusion.') + direct_return = True + else: + direct_return = False + + if direct_return: + d = [('Upscale (Fast)', 'upscale_fast', '2x')] + uov_input_image_path = log(uov_input_image, d, output_format=output_format) + yield_result(async_task, uov_input_image_path, do_not_show_finished_images=True) + return + + tiled = True + denoising_strength = 0.382 + + if overwrite_upscale_strength > 0: + denoising_strength = overwrite_upscale_strength + + initial_pixels = core.numpy_to_pytorch(uov_input_image) + progressbar(async_task, 13, 'VAE encoding ...') + + candidate_vae, _ = pipeline.get_candidate_vae( + steps=steps, + switch=switch, + denoise=denoising_strength, + refiner_swap_method=refiner_swap_method + ) + + initial_latent = core.encode_vae( + vae=candidate_vae, + pixels=initial_pixels, tiled=True) + B, C, H, W = initial_latent['samples'].shape + width = W * 8 + height = H * 8 + print(f'Final resolution is {str((height, width))}.') + + if 'inpaint' in goals: + if len(outpaint_selections) > 0: + H, W, C = inpaint_image.shape + if 'top' in outpaint_selections: + inpaint_image = np.pad(inpaint_image, [[int(H * 0.3), 0], [0, 0], [0, 0]], mode='edge') + inpaint_mask = np.pad(inpaint_mask, [[int(H * 0.3), 0], [0, 0]], mode='constant', + constant_values=255) + if 'bottom' in outpaint_selections: + inpaint_image = np.pad(inpaint_image, [[0, int(H * 0.3)], [0, 0], [0, 0]], mode='edge') + inpaint_mask = np.pad(inpaint_mask, [[0, int(H * 0.3)], [0, 0]], mode='constant', + constant_values=255) + + H, W, C = 
inpaint_image.shape + if 'left' in outpaint_selections: + inpaint_image = np.pad(inpaint_image, [[0, 0], [int(H * 0.3), 0], [0, 0]], mode='edge') + inpaint_mask = np.pad(inpaint_mask, [[0, 0], [int(H * 0.3), 0]], mode='constant', + constant_values=255) + if 'right' in outpaint_selections: + inpaint_image = np.pad(inpaint_image, [[0, 0], [0, int(H * 0.3)], [0, 0]], mode='edge') + inpaint_mask = np.pad(inpaint_mask, [[0, 0], [0, int(H * 0.3)]], mode='constant', + constant_values=255) + + inpaint_image = np.ascontiguousarray(inpaint_image.copy()) + inpaint_mask = np.ascontiguousarray(inpaint_mask.copy()) + inpaint_strength = 1.0 + inpaint_respective_field = 1.0 + + denoising_strength = inpaint_strength + + inpaint_worker.current_task = inpaint_worker.InpaintWorker( + image=inpaint_image, + mask=inpaint_mask, + use_fill=denoising_strength > 0.99, + k=inpaint_respective_field + ) + + if debugging_inpaint_preprocessor: + yield_result(async_task, inpaint_worker.current_task.visualize_mask_processing(), + do_not_show_finished_images=True) + return + + progressbar(async_task, 13, 'VAE Inpaint encoding ...') + + inpaint_pixel_fill = core.numpy_to_pytorch(inpaint_worker.current_task.interested_fill) + inpaint_pixel_image = core.numpy_to_pytorch(inpaint_worker.current_task.interested_image) + inpaint_pixel_mask = core.numpy_to_pytorch(inpaint_worker.current_task.interested_mask) + + candidate_vae, candidate_vae_swap = pipeline.get_candidate_vae( + steps=steps, + switch=switch, + denoise=denoising_strength, + refiner_swap_method=refiner_swap_method + ) + + latent_inpaint, latent_mask = core.encode_vae_inpaint( + mask=inpaint_pixel_mask, + vae=candidate_vae, + pixels=inpaint_pixel_image) + + latent_swap = None + if candidate_vae_swap is not None: + progressbar(async_task, 13, 'VAE SD15 encoding ...') + latent_swap = core.encode_vae( + vae=candidate_vae_swap, + pixels=inpaint_pixel_fill)['samples'] + + progressbar(async_task, 13, 'VAE encoding ...') + latent_fill = 
core.encode_vae( + vae=candidate_vae, + pixels=inpaint_pixel_fill)['samples'] + + inpaint_worker.current_task.load_latent( + latent_fill=latent_fill, latent_mask=latent_mask, latent_swap=latent_swap) + + if inpaint_parameterized: + pipeline.final_unet = inpaint_worker.current_task.patch( + inpaint_head_model_path=inpaint_head_model_path, + inpaint_latent=latent_inpaint, + inpaint_latent_mask=latent_mask, + model=pipeline.final_unet + ) + + if not inpaint_disable_initial_latent: + initial_latent = {'samples': latent_fill} + + B, C, H, W = latent_fill.shape + height, width = H * 8, W * 8 + final_height, final_width = inpaint_worker.current_task.image.shape[:2] + print(f'Final resolution is {str((final_height, final_width))}, latent is {str((height, width))}.') + + if 'cn' in goals: + for task in cn_tasks[flags.cn_canny]: + cn_img, cn_stop, cn_weight = task + cn_img = resize_image(HWC3(cn_img), width=width, height=height) + + if not skipping_cn_preprocessor: + cn_img = preprocessors.canny_pyramid(cn_img, canny_low_threshold, canny_high_threshold) + + cn_img = HWC3(cn_img) + task[0] = core.numpy_to_pytorch(cn_img) + if debugging_cn_preprocessor: + yield_result(async_task, cn_img, do_not_show_finished_images=True) + return + for task in cn_tasks[flags.cn_cpds]: + cn_img, cn_stop, cn_weight = task + cn_img = resize_image(HWC3(cn_img), width=width, height=height) + + if not skipping_cn_preprocessor: + cn_img = preprocessors.cpds(cn_img) + + cn_img = HWC3(cn_img) + task[0] = core.numpy_to_pytorch(cn_img) + if debugging_cn_preprocessor: + yield_result(async_task, cn_img, do_not_show_finished_images=True) + return + for task in cn_tasks[flags.cn_ip]: + cn_img, cn_stop, cn_weight = task + cn_img = HWC3(cn_img) + + # https://github.com/tencent-ailab/IP-Adapter/blob/d580c50a291566bbf9fc7ac0f760506607297e6d/README.md?plain=1#L75 + cn_img = resize_image(cn_img, width=224, height=224, resize_mode=0) + + task[0] = ip_adapter.preprocess(cn_img, ip_adapter_path=ip_adapter_path) + if 
debugging_cn_preprocessor: + yield_result(async_task, cn_img, do_not_show_finished_images=True) + return + for task in cn_tasks[flags.cn_ip_face]: + cn_img, cn_stop, cn_weight = task + cn_img = HWC3(cn_img) + + if not skipping_cn_preprocessor: + cn_img = extras.face_crop.crop_image(cn_img) + + # https://github.com/tencent-ailab/IP-Adapter/blob/d580c50a291566bbf9fc7ac0f760506607297e6d/README.md?plain=1#L75 + cn_img = resize_image(cn_img, width=224, height=224, resize_mode=0) + + task[0] = ip_adapter.preprocess(cn_img, ip_adapter_path=ip_adapter_face_path) + if debugging_cn_preprocessor: + yield_result(async_task, cn_img, do_not_show_finished_images=True) + return + + all_ip_tasks = cn_tasks[flags.cn_ip] + cn_tasks[flags.cn_ip_face] + + if len(all_ip_tasks) > 0: + pipeline.final_unet = ip_adapter.patch_model(pipeline.final_unet, all_ip_tasks) + + if freeu_enabled: + print(f'FreeU is enabled!') + pipeline.final_unet = core.apply_freeu( + pipeline.final_unet, + freeu_b1, + freeu_b2, + freeu_s1, + freeu_s2 + ) + + all_steps = steps * image_number + + print(f'[Parameters] Denoising Strength = {denoising_strength}') + + if isinstance(initial_latent, dict) and 'samples' in initial_latent: + log_shape = initial_latent['samples'].shape + else: + log_shape = f'Image Space {(height, width)}' + + print(f'[Parameters] Initial Latent shape: {log_shape}') + + preparation_time = time.perf_counter() - execution_start_time + print(f'Preparation time: {preparation_time:.2f} seconds') + + final_sampler_name = sampler_name + final_scheduler_name = scheduler_name + + if scheduler_name == 'lcm': + final_scheduler_name = 'sgm_uniform' + if pipeline.final_unet is not None: + pipeline.final_unet = core.opModelSamplingDiscrete.patch( + pipeline.final_unet, + sampling='lcm', + zsnr=False)[0] + if pipeline.final_refiner_unet is not None: + pipeline.final_refiner_unet = core.opModelSamplingDiscrete.patch( + pipeline.final_refiner_unet, + sampling='lcm', + zsnr=False)[0] + print('Using lcm 
scheduler.') + + async_task.yields.append(['preview', (13, 'Moving model to GPU ...', None)]) + + def callback(step, x0, x, total_steps, y): + done_steps = current_task_id * steps + step + async_task.yields.append(['preview', ( + int(15.0 + 85.0 * float(done_steps) / float(all_steps)), + f'Step {step}/{total_steps} in the {current_task_id + 1}{ordinal_suffix(current_task_id + 1)} Sampling', y)]) + + for current_task_id, task in enumerate(tasks): + execution_start_time = time.perf_counter() + + try: + if async_task.last_stop is not False: + ldm_patched.modules.model_management.interrupt_current_processing() + positive_cond, negative_cond = task['c'], task['uc'] + + if 'cn' in goals: + for cn_flag, cn_path in [ + (flags.cn_canny, controlnet_canny_path), + (flags.cn_cpds, controlnet_cpds_path) + ]: + for cn_img, cn_stop, cn_weight in cn_tasks[cn_flag]: + positive_cond, negative_cond = core.apply_controlnet( + positive_cond, negative_cond, + pipeline.loaded_ControlNets[cn_path], cn_img, cn_weight, 0, cn_stop) + + imgs = pipeline.process_diffusion( + positive_cond=positive_cond, + negative_cond=negative_cond, + steps=steps, + switch=switch, + width=width, + height=height, + image_seed=task['task_seed'], + callback=callback, + sampler_name=final_sampler_name, + scheduler_name=final_scheduler_name, + latent=initial_latent, + denoise=denoising_strength, + tiled=tiled, + cfg_scale=cfg_scale, + refiner_swap_method=refiner_swap_method, + disable_preview=disable_preview + ) + + del task['c'], task['uc'], positive_cond, negative_cond # Save memory + + if inpaint_worker.current_task is not None: + imgs = [inpaint_worker.current_task.post_process(x) for x in imgs] + + img_paths = [] + for x in imgs: + d = [('Prompt', 'prompt', task['log_positive_prompt']), + ('Negative Prompt', 'negative_prompt', task['log_negative_prompt']), + ('Fooocus V2 Expansion', 'prompt_expansion', task['expansion']), + ('Styles', 'styles', str(raw_style_selections)), + ('Performance', 'performance', 
performance_selection.value)] + + if performance_selection.steps() != steps: + d.append(('Steps', 'steps', steps)) + + d += [('Resolution', 'resolution', str((width, height))), + ('Guidance Scale', 'guidance_scale', guidance_scale), + ('Sharpness', 'sharpness', sharpness), + ('ADM Guidance', 'adm_guidance', str(( + modules.patch.patch_settings[pid].positive_adm_scale, + modules.patch.patch_settings[pid].negative_adm_scale, + modules.patch.patch_settings[pid].adm_scaler_end))), + ('Base Model', 'base_model', base_model_name), + ('Refiner Model', 'refiner_model', refiner_model_name), + ('Refiner Switch', 'refiner_switch', refiner_switch)] + + if refiner_model_name != 'None': + if overwrite_switch > 0: + d.append(('Overwrite Switch', 'overwrite_switch', overwrite_switch)) + if refiner_swap_method != flags.refiner_swap_method: + d.append(('Refiner Swap Method', 'refiner_swap_method', refiner_swap_method)) + if modules.patch.patch_settings[pid].adaptive_cfg != modules.config.default_cfg_tsnr: + d.append(('CFG Mimicking from TSNR', 'adaptive_cfg', modules.patch.patch_settings[pid].adaptive_cfg)) + + d.append(('Sampler', 'sampler', sampler_name)) + d.append(('Scheduler', 'scheduler', scheduler_name)) + d.append(('Seed', 'seed', task['task_seed'])) + + if freeu_enabled: + d.append(('FreeU', 'freeu', str((freeu_b1, freeu_b2, freeu_s1, freeu_s2)))) + + for li, (n, w) in enumerate(loras): + if n != 'None': + d.append((f'LoRA {li + 1}', f'lora_combined_{li + 1}', f'{n} : {w}')) + + metadata_parser = None + if save_metadata_to_images: + metadata_parser = modules.meta_parser.get_metadata_parser(metadata_scheme) + metadata_parser.set_data(task['log_positive_prompt'], task['positive'], + task['log_negative_prompt'], task['negative'], + steps, base_model_name, refiner_model_name, loras) + d.append(('Metadata Scheme', 'metadata_scheme', metadata_scheme.value if save_metadata_to_images else save_metadata_to_images)) + d.append(('Version', 'version', 'Fooocus v' + 
def auth_list_to_dict(auth_list):
    """Convert a list of auth entries into a ``{user: sha256_hex}`` mapping.

    Each entry is looked up by key:

    * entries without a ``'user'`` key are ignored;
    * if ``'hash'`` is present its value is stored unchanged (it wins over
      ``'pass'`` when both are given);
    * otherwise, if ``'pass'`` is present, the SHA-256 hex digest of the
      UTF-8 encoded password is stored;
    * entries with neither key are ignored.

    When the same user appears more than once, the last entry wins.
    """
    users = {}
    for entry in auth_list:
        if 'user' not in entry:
            continue

        if 'hash' in entry:
            users[entry['user']] = entry['hash']
        elif 'pass' in entry:
            raw_password = bytes(entry['pass'], encoding='utf-8')
            users[entry['user']] = hashlib.sha256(raw_password).hexdigest()
    return users
def check_auth(user, password):
    """Return True when *password* matches the stored digest for *user*.

    The module-level ``auth_dict`` maps user names to SHA-256 hex digests.
    *password* is hashed the same way (UTF-8, SHA-256, hex) and compared
    against the stored value.

    Returns False when ``auth_dict`` is None (no auth data was loaded), when
    the user is unknown, or when the stored value is not a string.
    """
    import hmac  # local import so this block does not depend on the file header

    # auth_dict is None when load_auth_data() found no usable file; the
    # original membership test raised TypeError in that case.
    if auth_dict is None or user not in auth_dict:
        return False

    stored_digest = auth_dict[user]
    supplied_digest = hashlib.sha256(bytes(password, encoding='utf-8')).hexdigest()

    # A hex digest is pure ASCII, so a non-string or non-ASCII stored value can
    # never match; rejecting it here also keeps compare_digest() from raising.
    if not isinstance(stored_digest, str) or not stored_digest.isascii():
        return False

    # Constant-time comparison: credentials are untrusted input, so avoid
    # leaking how many leading characters matched.
    return hmac.compare_digest(supplied_digest, stored_digest)
def try_load_deprecated_user_path_config():
    """Migrate settings from the legacy ``user_path_config.txt``, if it exists.

    Known legacy path keys are copied into the module-level ``config_dict``
    under their current names and removed from the legacy data. Then:

    * if the legacy ``default_model`` is the known previous default, the
      legacy file is renamed to ``user_path_config-deprecated.txt`` silently;
    * otherwise the user is asked on stdin; answering "no" merges the
      remaining legacy keys into ``config_dict`` and keeps the file, any
      other answer renames the file as above.

    Any exception is reported with ``print`` and swallowed, so a broken
    legacy file never prevents start-up. Always returns None.
    """
    global config_dict

    if not os.path.exists('user_path_config.txt'):
        return

    # (legacy key, current key), applied in this order.
    renamed_keys = (
        ('modelfile_path', 'path_checkpoints'),
        ('lorafile_path', 'path_loras'),
        ('embeddings_path', 'path_embeddings'),
        ('vae_approx_path', 'path_vae_approx'),
        ('upscale_models_path', 'path_upscale_models'),
        ('inpaint_models_path', 'path_inpaint'),
        ('controlnet_models_path', 'path_controlnet'),
        ('clip_vision_models_path', 'path_clip_vision'),
        ('fooocus_expansion_path', 'path_fooocus_expansion'),
        ('temp_outputs_path', 'path_outputs'),
    )

    try:
        # Close the handle explicitly before the file is renamed below; the
        # original relied on garbage collection to close it.
        with open('user_path_config.txt', "r", encoding="utf-8") as legacy_file:
            deprecated_config_dict = json.load(legacy_file)

        for old_key, new_key in renamed_keys:
            if old_key in deprecated_config_dict:
                config_dict[new_key] = deprecated_config_dict.pop(old_key)

        if deprecated_config_dict.get("default_model", None) == 'juggernautXL_version6Rundiffusion.safetensors':
            os.replace('user_path_config.txt', 'user_path_config-deprecated.txt')
            print('Config updated successfully in silence. '
                  'A backup of previous config is written to "user_path_config-deprecated.txt".')
            return

        if input("Newer models and configs are available. "
                 "Download and update files? [Y/n]:") in ['n', 'N', 'No', 'no', 'NO']:
            config_dict.update(deprecated_config_dict)
            print('Loading using deprecated old models and deprecated old configs.')
            return
        else:
            os.replace('user_path_config.txt', 'user_path_config-deprecated.txt')
            print('Config updated successfully by user. '
                  'A backup of previous config is written to "user_path_config-deprecated.txt".')
            return
    except Exception as e:
        print('Processing deprecated config failed')
        print(e)
    return
def get_config_item_or_set_default(key, default_value, validator, disable_empty_as_none=False):
    """Resolve one config value, falling back to *default_value* when invalid.

    Lookup order: an environment variable named *key* (stored into
    ``config_dict`` as a string), then the existing ``config_dict`` entry.
    A missing key is filled with *default_value*. Unless
    *disable_empty_as_none* is set, a value of ``None`` or ``''`` is
    validated and returned as the string ``'None'``.

    *validator* is called with the candidate value; when it returns a falsy
    result a message is printed, ``config_dict[key]`` is reset to
    *default_value*, and the default is returned. *key* is recorded in
    ``visited_keys`` on first use.
    """
    global config_dict, visited_keys

    if key not in visited_keys:
        visited_keys.append(key)

    env_value = os.getenv(key)
    if env_value is not None:
        print(f"Environment: {key} = {env_value}")
        config_dict[key] = env_value

    if key not in config_dict:
        config_dict[key] = default_value
        return default_value

    v = config_dict[key]
    if not disable_empty_as_none and (v is None or v == ''):
        v = 'None'

    if validator(v):
        return v

    # Rejected by the validator: report it (unless there is nothing to show)
    # and fall back to the default.
    if v is not None:
        print(f'Failed to load config key: {json.dumps({key:v})} is invalid; will use {json.dumps({key:default_value})} instead.')
    config_dict[key] = default_value
    return default_value
+default_base_model_name = get_config_item_or_set_default( + key='default_model', + default_value='model.safetensors', + validator=lambda x: isinstance(x, str) +) +previous_default_models = get_config_item_or_set_default( + key='previous_default_models', + default_value=[], + validator=lambda x: isinstance(x, list) and all(isinstance(k, str) for k in x) +) +default_refiner_model_name = get_config_item_or_set_default( + key='default_refiner', + default_value='None', + validator=lambda x: isinstance(x, str) +) +default_refiner_switch = get_config_item_or_set_default( + key='default_refiner_switch', + default_value=0.8, + validator=lambda x: isinstance(x, numbers.Number) and 0 <= x <= 1 +) +default_loras_min_weight = get_config_item_or_set_default( + key='default_loras_min_weight', + default_value=-2, + validator=lambda x: isinstance(x, numbers.Number) and -10 <= x <= 10 +) +default_loras_max_weight = get_config_item_or_set_default( + key='default_loras_max_weight', + default_value=2, + validator=lambda x: isinstance(x, numbers.Number) and -10 <= x <= 10 +) +default_loras = get_config_item_or_set_default( + key='default_loras', + default_value=[ + [ + "None", + 1.0 + ], + [ + "None", + 1.0 + ], + [ + "None", + 1.0 + ], + [ + "None", + 1.0 + ], + [ + "None", + 1.0 + ] + ], + validator=lambda x: isinstance(x, list) and all(len(y) == 2 and isinstance(y[0], str) and isinstance(y[1], numbers.Number) for y in x) +) +default_max_lora_number = get_config_item_or_set_default( + key='default_max_lora_number', + default_value=len(default_loras) if isinstance(default_loras, list) and len(default_loras) > 0 else 5, + validator=lambda x: isinstance(x, int) and x >= 1 +) +default_cfg_scale = get_config_item_or_set_default( + key='default_cfg_scale', + default_value=7.0, + validator=lambda x: isinstance(x, numbers.Number) +) +default_sample_sharpness = get_config_item_or_set_default( + key='default_sample_sharpness', + default_value=2.0, + validator=lambda x: isinstance(x, 
numbers.Number) +) +default_sampler = get_config_item_or_set_default( + key='default_sampler', + default_value='dpmpp_2m_sde_gpu', + validator=lambda x: x in modules.flags.sampler_list +) +default_scheduler = get_config_item_or_set_default( + key='default_scheduler', + default_value='karras', + validator=lambda x: x in modules.flags.scheduler_list +) +default_styles = get_config_item_or_set_default( + key='default_styles', + default_value=[ + "Fooocus V2", + "Fooocus Enhance", + "Fooocus Sharp" + ], + validator=lambda x: isinstance(x, list) and all(y in modules.sdxl_styles.legal_style_names for y in x) +) +default_prompt_negative = get_config_item_or_set_default( + key='default_prompt_negative', + default_value='', + validator=lambda x: isinstance(x, str), + disable_empty_as_none=True +) +default_prompt = get_config_item_or_set_default( + key='default_prompt', + default_value='', + validator=lambda x: isinstance(x, str), + disable_empty_as_none=True +) +default_performance = get_config_item_or_set_default( + key='default_performance', + default_value=Performance.SPEED.value, + validator=lambda x: x in Performance.list() +) +default_advanced_checkbox = get_config_item_or_set_default( + key='default_advanced_checkbox', + default_value=False, + validator=lambda x: isinstance(x, bool) +) +default_max_image_number = get_config_item_or_set_default( + key='default_max_image_number', + default_value=32, + validator=lambda x: isinstance(x, int) and x >= 1 +) +default_output_format = get_config_item_or_set_default( + key='default_output_format', + default_value='png', + validator=lambda x: x in modules.flags.output_formats +) +default_image_number = get_config_item_or_set_default( + key='default_image_number', + default_value=2, + validator=lambda x: isinstance(x, int) and 1 <= x <= default_max_image_number +) +checkpoint_downloads = get_config_item_or_set_default( + key='checkpoint_downloads', + default_value={}, + validator=lambda x: isinstance(x, dict) and 
all(isinstance(k, str) and isinstance(v, str) for k, v in x.items()) +) +lora_downloads = get_config_item_or_set_default( + key='lora_downloads', + default_value={}, + validator=lambda x: isinstance(x, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in x.items()) +) +embeddings_downloads = get_config_item_or_set_default( + key='embeddings_downloads', + default_value={}, + validator=lambda x: isinstance(x, dict) and all(isinstance(k, str) and isinstance(v, str) for k, v in x.items()) +) +available_aspect_ratios = get_config_item_or_set_default( + key='available_aspect_ratios', + default_value=[ + '704*1408', '704*1344', '768*1344', '768*1280', '832*1216', '832*1152', + '896*1152', '896*1088', '960*1088', '960*1024', '1024*1024', '1024*960', + '1088*960', '1088*896', '1152*896', '1152*832', '1216*832', '1280*768', + '1344*768', '1344*704', '1408*704', '1472*704', '1536*640', '1600*640', + '1664*576', '1728*576' + ], + validator=lambda x: isinstance(x, list) and all('*' in v for v in x) and len(x) > 1 +) +default_aspect_ratio = get_config_item_or_set_default( + key='default_aspect_ratio', + default_value='1152*896' if '1152*896' in available_aspect_ratios else available_aspect_ratios[0], + validator=lambda x: x in available_aspect_ratios +) +default_inpaint_engine_version = get_config_item_or_set_default( + key='default_inpaint_engine_version', + default_value='v2.6', + validator=lambda x: x in modules.flags.inpaint_engine_versions +) +default_cfg_tsnr = get_config_item_or_set_default( + key='default_cfg_tsnr', + default_value=7.0, + validator=lambda x: isinstance(x, numbers.Number) +) +default_overwrite_step = get_config_item_or_set_default( + key='default_overwrite_step', + default_value=-1, + validator=lambda x: isinstance(x, int) +) +default_overwrite_switch = get_config_item_or_set_default( + key='default_overwrite_switch', + default_value=-1, + validator=lambda x: isinstance(x, int) +) +example_inpaint_prompts = get_config_item_or_set_default( + 
key='example_inpaint_prompts', + default_value=[ + 'highly detailed face', 'detailed girl face', 'detailed man face', 'detailed hand', 'beautiful eyes' + ], + validator=lambda x: isinstance(x, list) and all(isinstance(v, str) for v in x) +) +default_save_metadata_to_images = get_config_item_or_set_default( + key='default_save_metadata_to_images', + default_value=True, + validator=lambda x: isinstance(x, bool) +) +default_metadata_scheme = get_config_item_or_set_default( + key='default_metadata_scheme', + default_value=MetadataScheme.FOOOCUS.value, + validator=lambda x: x in [y[1] for y in modules.flags.metadata_scheme if y[1] == x] +) +metadata_created_by = get_config_item_or_set_default( + key='metadata_created_by', + default_value='', + validator=lambda x: isinstance(x, str) +) + +example_inpaint_prompts = [[x] for x in example_inpaint_prompts] + +config_dict["default_loras"] = default_loras = default_loras[:default_max_lora_number] + [['None', 1.0] for _ in range(default_max_lora_number - len(default_loras))] + +possible_preset_keys = [ + "default_model", + "default_refiner", + "default_refiner_switch", + "default_loras_min_weight", + "default_loras_max_weight", + "default_loras", + "default_max_lora_number", + "default_cfg_scale", + "default_sample_sharpness", + "default_sampler", + "default_scheduler", + "default_performance", + "default_prompt", + "default_prompt_negative", + "default_styles", + "default_aspect_ratio", + "default_save_metadata_to_images", + "checkpoint_downloads", + "embeddings_downloads", + "lora_downloads", +] + + +REWRITE_PRESET = False + +if REWRITE_PRESET and isinstance(args_manager.args.preset, str): + save_path = 'presets/' + args_manager.args.preset + '.json' + with open(save_path, "w", encoding="utf-8") as json_file: + json.dump({k: config_dict[k] for k in possible_preset_keys}, json_file, indent=4) + print(f'Preset saved to {save_path}. 
def add_ratio(x):
    """Turn a ``'W*H'`` (or ``'W H'``) size string into a display label.

    The result has the form ``'W×H ∣ a:b'`` where ``a:b`` is the aspect ratio
    reduced by the greatest common divisor, e.g. ``'1152*896'`` becomes
    ``'1152×896 ∣ 9:7'``. Only the first two numbers are used.

    Raises ValueError when fewer than two integers can be parsed.
    """
    # split() without an argument collapses runs of whitespace, so inputs such
    # as '1152 * 896' or ' 1152*896' parse; split(' ') produced empty tokens
    # for them and int('') raised ValueError.
    a, b = x.replace('*', ' ').split()[:2]
    a, b = int(a), int(b)
    g = math.gcd(a, b)
    return f'{a}×{b} \U00002223 {a // g}:{b // g}'
def downloading_inpaint_models(v):
    """Fetch the inpaint head model and the patch file for engine version *v*.

    *v* must be one of ``modules.flags.inpaint_engine_versions``. The head
    model is always requested; a patch file is requested only for the
    versions listed in the table below.

    Returns ``(head_file, patch_file)``, both paths inside ``path_inpaint``;
    ``patch_file`` is None when *v* has no entry in the table.
    """
    assert v in modules.flags.inpaint_engine_versions

    # engine version -> (download url, local file name)
    patch_sources = {
        'v1': (
            'https://huggingface.co/lllyasviel/fooocus_inpaint/resolve/main/inpaint.fooocus.patch',
            'inpaint.fooocus.patch',
        ),
        'v2.5': (
            'https://huggingface.co/lllyasviel/fooocus_inpaint/resolve/main/inpaint_v25.fooocus.patch',
            'inpaint_v25.fooocus.patch',
        ),
        'v2.6': (
            'https://huggingface.co/lllyasviel/fooocus_inpaint/resolve/main/inpaint_v26.fooocus.patch',
            'inpaint_v26.fooocus.patch',
        ),
    }

    head_name = 'fooocus_inpaint_head.pth'
    load_file_from_url(
        url='https://huggingface.co/lllyasviel/fooocus_inpaint/resolve/main/fooocus_inpaint_head.pth',
        model_dir=path_inpaint,
        file_name=head_name
    )
    head_file = os.path.join(path_inpaint, head_name)

    source = patch_sources.get(v)
    if source is None:
        return head_file, None

    patch_url, patch_name = source
    load_file_from_url(url=patch_url, model_dir=path_inpaint, file_name=patch_name)
    return head_file, os.path.join(path_inpaint, patch_name)
def downloading_ip_adapters(v):
    """Fetch the files needed by an IP-Adapter variant and return their paths.

    *v* is ``'ip'`` or ``'face'``. The CLIP vision model and the negative
    embedding are requested for both variants, followed by the adapter
    weights for the chosen variant.

    Returns a list of local paths in download order: CLIP vision model,
    negative embedding, adapter weights.
    """
    assert v in ['ip', 'face']

    # (url, target directory, file name), in download order.
    downloads = [
        ('https://huggingface.co/lllyasviel/misc/resolve/main/clip_vision_vit_h.safetensors',
         path_clip_vision, 'clip_vision_vit_h.safetensors'),
        ('https://huggingface.co/lllyasviel/misc/resolve/main/fooocus_ip_negative.safetensors',
         path_controlnet, 'fooocus_ip_negative.safetensors'),
    ]

    if v == 'ip':
        downloads.append(
            ('https://huggingface.co/lllyasviel/misc/resolve/main/ip-adapter-plus_sdxl_vit-h.bin',
             path_controlnet, 'ip-adapter-plus_sdxl_vit-h.bin'))

    if v == 'face':
        downloads.append(
            ('https://huggingface.co/lllyasviel/misc/resolve/main/ip-adapter-plus-face_sdxl_vit-h.bin',
             path_controlnet, 'ip-adapter-plus-face_sdxl_vit-h.bin'))

    results = []
    for url, model_dir, file_name in downloads:
        load_file_from_url(url=url, model_dir=model_dir, file_name=file_name)
        results.append(os.path.join(model_dir, file_name))

    return results
VAEDecode() +opVAEEncode = VAEEncode() +opVAEDecodeTiled = VAEDecodeTiled() +opVAEEncodeTiled = VAEEncodeTiled() +opControlNetApplyAdvanced = ControlNetApplyAdvanced() +opFreeU = FreeU_V2() +opModelSamplingDiscrete = ModelSamplingDiscrete() + + +class StableDiffusionModel: + def __init__(self, unet=None, vae=None, clip=None, clip_vision=None, filename=None): + self.unet = unet + self.vae = vae + self.clip = clip + self.clip_vision = clip_vision + self.filename = filename + self.unet_with_lora = unet + self.clip_with_lora = clip + self.visited_loras = '' + + self.lora_key_map_unet = {} + self.lora_key_map_clip = {} + + if self.unet is not None: + self.lora_key_map_unet = model_lora_keys_unet(self.unet.model, self.lora_key_map_unet) + self.lora_key_map_unet.update({x: x for x in self.unet.model.state_dict().keys()}) + + if self.clip is not None: + self.lora_key_map_clip = model_lora_keys_clip(self.clip.cond_stage_model, self.lora_key_map_clip) + self.lora_key_map_clip.update({x: x for x in self.clip.cond_stage_model.state_dict().keys()}) + + @torch.no_grad() + @torch.inference_mode() + def refresh_loras(self, loras): + assert isinstance(loras, list) + + if self.visited_loras == str(loras): + return + + self.visited_loras = str(loras) + + if self.unet is None: + return + + print(f'Request to load LoRAs {str(loras)} for model [{self.filename}].') + + loras_to_load = [] + + for name, weight in loras: + if name == 'None': + continue + + if os.path.exists(name): + lora_filename = name + else: + lora_filename = get_file_from_folder_list(name, modules.config.paths_loras) + + if not os.path.exists(lora_filename): + print(f'Lora file not found: {lora_filename}') + continue + + loras_to_load.append((lora_filename, weight)) + + self.unet_with_lora = self.unet.clone() if self.unet is not None else None + self.clip_with_lora = self.clip.clone() if self.clip is not None else None + + for lora_filename, weight in loras_to_load: + lora_unmatch = 
ldm_patched.modules.utils.load_torch_file(lora_filename, safe_load=False) + lora_unet, lora_unmatch = match_lora(lora_unmatch, self.lora_key_map_unet) + lora_clip, lora_unmatch = match_lora(lora_unmatch, self.lora_key_map_clip) + + if len(lora_unmatch) > 12: + # model mismatch + continue + + if len(lora_unmatch) > 0: + print(f'Loaded LoRA [{lora_filename}] for model [{self.filename}] ' + f'with unmatched keys {list(lora_unmatch.keys())}') + + if self.unet_with_lora is not None and len(lora_unet) > 0: + loaded_keys = self.unet_with_lora.add_patches(lora_unet, weight) + print(f'Loaded LoRA [{lora_filename}] for UNet [{self.filename}] ' + f'with {len(loaded_keys)} keys at weight {weight}.') + for item in lora_unet: + if item not in loaded_keys: + print("UNet LoRA key skipped: ", item) + + if self.clip_with_lora is not None and len(lora_clip) > 0: + loaded_keys = self.clip_with_lora.add_patches(lora_clip, weight) + print(f'Loaded LoRA [{lora_filename}] for CLIP [{self.filename}] ' + f'with {len(loaded_keys)} keys at weight {weight}.') + for item in lora_clip: + if item not in loaded_keys: + print("CLIP LoRA key skipped: ", item) + + +@torch.no_grad() +@torch.inference_mode() +def apply_freeu(model, b1, b2, s1, s2): + return opFreeU.patch(model=model, b1=b1, b2=b2, s1=s1, s2=s2)[0] + + +@torch.no_grad() +@torch.inference_mode() +def load_controlnet(ckpt_filename): + return ldm_patched.modules.controlnet.load_controlnet(ckpt_filename) + + +@torch.no_grad() +@torch.inference_mode() +def apply_controlnet(positive, negative, control_net, image, strength, start_percent, end_percent): + return opControlNetApplyAdvanced.apply_controlnet(positive=positive, negative=negative, control_net=control_net, + image=image, strength=strength, start_percent=start_percent, end_percent=end_percent) + + +@torch.no_grad() +@torch.inference_mode() +def load_model(ckpt_filename): + unet, clip, vae, clip_vision = load_checkpoint_guess_config(ckpt_filename, embedding_directory=path_embeddings) + 
@torch.no_grad()
@torch.inference_mode()
def encode_vae_inpaint(vae, pixels, mask):
    """Encode an image for inpainting and build the matching latent-resolution mask.

    Args:
        vae: Object exposing ``encode(pixels)`` that returns a 4-D latent.
        pixels: 4-D image tensor whose dims 1 and 2 match the mask's last two
            dims (presumably batch, height, width, channels - confirm).
        mask: 3-D mask tensor; values are rounded, so 1 marks the region to fill.

    Returns:
        ``(latent, latent_mask)``. The mask is resized to 8x the latent size,
        max-pooled by 8 and converted to the latent's dtype and device.
    """
    assert mask.ndim == 3 and pixels.ndim == 4
    assert mask.shape[-1] == pixels.shape[-2]
    assert mask.shape[-2] == pixels.shape[-3]

    # Replace masked pixels with mid-grey (0.5) before encoding.
    hard_mask = mask.round().unsqueeze(-1)
    pixels = pixels * (1 - hard_mask) + 0.5 * hard_mask

    latent = vae.encode(pixels)
    _, _, latent_h, latent_w = latent.shape

    resize = torch.nn.functional.interpolate
    pool = torch.nn.functional.max_pool2d

    upsampled = resize(mask.unsqueeze(1), size=(latent_h * 8, latent_w * 8), mode="bilinear").round()
    # Max-pooling marks a latent cell as masked if any of its 8x8 pixels is masked.
    latent_mask = pool(upsampled, (8, 8)).round().to(latent)

    return latent, latent_mask
@torch.no_grad()
@torch.inference_mode()
def get_previewer(model):
    """Return a callback that turns a latent into a uint8 preview image.

    The approximate decoder (``VAEApprox``) is loaded once per weights file and
    cached in ``VAE_approx_models``. The weights file is chosen by whether
    ``model.model.latent_format`` is the SDXL latent format.

    Returns:
        ``preview_function(x0, step, total_steps)``, which returns the first
        batch item as an array laid out as height, width, channels.
    """
    global VAE_approx_models

    from modules.config import path_vae_approx

    latent_format = model.model.latent_format
    if isinstance(latent_format, ldm_patched.modules.latent_formats.SDXL):
        weights_name = 'xlvaeapp.pth'
    else:
        weights_name = 'vaeapp_sd15.pth'
    vae_approx_filename = os.path.join(path_vae_approx, weights_name)

    approx = VAE_approx_models.get(vae_approx_filename)
    if approx is None:
        state = torch.load(vae_approx_filename, map_location='cpu')
        approx = VAEApprox()
        approx.load_state_dict(state)
        del state
        approx.eval()

        management = ldm_patched.modules.model_management
        if management.should_use_fp16():
            approx.half()
            approx.current_type = torch.float16
        else:
            approx.float()
            approx.current_type = torch.float32

        approx.to(management.get_torch_device())
        VAE_approx_models[vae_approx_filename] = approx

    @torch.no_grad()
    @torch.inference_mode()
    def preview_function(x0, step, total_steps):
        # Decoder output is mapped from roughly [-1, 1] to [0, 255] before clipping.
        decoded = approx(x0.to(approx.current_type)) * 127.5 + 127.5
        first_image = einops.rearrange(decoded, 'b c h w -> b h w c')[0]
        return first_image.cpu().numpy().clip(0, 255).astype(np.uint8)

    return preview_function
force_full_denoise=False, callback_function=None, refiner=None, refiner_switch=-1, + previewer_start=None, previewer_end=None, sigmas=None, noise_mean=None, disable_preview=False): + + if sigmas is not None: + sigmas = sigmas.clone().to(ldm_patched.modules.model_management.get_torch_device()) + + latent_image = latent["samples"] + + if disable_noise: + noise = torch.zeros(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, device="cpu") + else: + batch_inds = latent["batch_index"] if "batch_index" in latent else None + noise = ldm_patched.modules.sample.prepare_noise(latent_image, seed, batch_inds) + + if isinstance(noise_mean, torch.Tensor): + noise = noise + noise_mean - torch.mean(noise, dim=1, keepdim=True) + + noise_mask = None + if "noise_mask" in latent: + noise_mask = latent["noise_mask"] + + previewer = get_previewer(model) + + if previewer_start is None: + previewer_start = 0 + + if previewer_end is None: + previewer_end = steps + + def callback(step, x0, x, total_steps): + ldm_patched.modules.model_management.throw_exception_if_processing_interrupted() + y = None + if previewer is not None and not disable_preview: + y = previewer(x0, previewer_start + step, previewer_end) + if callback_function is not None: + callback_function(previewer_start + step, x0, x, previewer_end, y) + + disable_pbar = False + modules.sample_hijack.current_refiner = refiner + modules.sample_hijack.refiner_switch_step = refiner_switch + ldm_patched.modules.samplers.sample = modules.sample_hijack.sample_hacked + + try: + samples = ldm_patched.modules.sample.sample(model, + noise, steps, cfg, sampler_name, scheduler, + positive, negative, latent_image, + denoise=denoise, disable_noise=disable_noise, + start_step=start_step, + last_step=last_step, + force_full_denoise=force_full_denoise, noise_mask=noise_mask, + callback=callback, + disable_pbar=disable_pbar, seed=seed, sigmas=sigmas) + + out = latent.copy() + out["samples"] = samples + finally: + 
@torch.no_grad()
@torch.inference_mode()
def numpy_to_pytorch(x):
    """Convert an image array to a float32 tensor with a leading batch dimension.

    Args:
        x: NumPy array; values are divided by 255 after conversion to float32.

    Returns:
        A C-contiguous ``torch.float32`` tensor of shape ``(1, *x.shape)``.
    """
    # astype() and the division each produce a fresh float32 array, so no extra
    # copy is needed; ascontiguousarray only copies when the layout requires it.
    scaled = x.astype(np.float32) / 255.0
    batched = np.ascontiguousarray(scaled[None])
    return torch.from_numpy(batched)
@torch.no_grad()
@torch.inference_mode()
def refresh_refiner_model(name):
    """Load the refiner checkpoint ``name`` into the global ``model_refiner``.

    Nothing happens when the resolved file is already loaded. The name
    ``'None'`` unloads the refiner, leaving an empty ``StableDiffusionModel``.
    After loading, the refiner's CLIP is always dropped; its VAE is dropped
    only when the UNet model is ``SDXL`` or ``SDXLRefiner``.
    """
    global model_refiner

    filename = get_file_from_folder_list(name, modules.config.paths_checkpoints)
    if model_refiner.filename == filename:
        return

    # Reset first so the previous refiner is released before a new one is loaded.
    model_refiner = core.StableDiffusionModel()

    if name == 'None':
        print('Refiner unloaded.')
        return

    model_refiner = core.load_model(filename)
    print(f'Refiner model loaded: {model_refiner.filename}')

    shares_base_vae = isinstance(model_refiner.unet.model, (SDXL, SDXLRefiner))
    model_refiner.clip = None
    if shares_base_vae:
        model_refiner.vae = None

    return
@torch.no_grad()
@torch.inference_mode()
def clip_encode_single(clip, text, verbose=False):
    """Encode one prompt with ``clip``, memoising results on ``clip.fcs_cond_cache``.

    Args:
        clip: Object with ``tokenize``, ``encode_from_tokens`` and a dict-like
            ``fcs_cond_cache`` attribute.
        text: Prompt to encode; also the cache key.
        verbose: When true, print whether the result was cached or freshly encoded.

    Returns:
        Whatever ``clip.encode_from_tokens(tokens, return_pooled=True)`` returned
        for this text, either now or on an earlier call.
    """
    cache = clip.fcs_cond_cache
    entry = cache.get(text)

    if entry is None:
        entry = clip.encode_from_tokens(clip.tokenize(text), return_pooled=True)
        cache[text] = entry
        if verbose:
            print(f'[CLIP Encoded] {text}')
    elif verbose:
        print(f'[CLIP Cached] {text}')

    return entry
@torch.no_grad()
@torch.inference_mode()
def refresh_everything(refiner_model_name, base_model_name, loras,
                       base_model_additional_loras=None, use_synthetic_refiner=False):
    """Reload models and LoRAs as needed and republish the module-level ``final_*`` handles.

    Args:
        refiner_model_name: Refiner checkpoint name; ``'None'`` means no refiner.
        base_model_name: Base checkpoint name.
        loras: LoRA list forwarded to ``refresh_loras``.
        base_model_additional_loras: Extra LoRAs forwarded to ``refresh_loras``
            under the same keyword.
        use_synthetic_refiner: When true and no refiner is selected, the refiner
            is synthesized from the base model instead of being loaded.

    Raises:
        NotImplementedError: Propagated from ``assert_model_integrity`` when the
            base model is not SDXL.
    """
    global final_unet, final_clip, final_vae, final_refiner_unet, final_refiner_vae, final_expansion

    # Clear the published handles first so nothing stale survives a failed refresh.
    final_unet = None
    final_clip = None
    final_vae = None
    final_refiner_unet = None
    final_refiner_vae = None

    if use_synthetic_refiner and refiner_model_name == 'None':
        print('Synthetic Refiner Activated')
        # The base model must be loaded before the refiner is derived from it.
        refresh_base_model(base_model_name)
        synthesize_refiner_model()
    else:
        refresh_refiner_model(refiner_model_name)
        refresh_base_model(base_model_name)

    refresh_loras(loras, base_model_additional_loras=base_model_additional_loras)
    assert_model_integrity()

    final_unet = model_base.unet_with_lora
    final_clip = model_base.clip_with_lora
    final_vae = model_base.vae

    final_refiner_unet = model_refiner.unet_with_lora
    final_refiner_vae = model_refiner.vae

    # The prompt-expansion model is created once and reused across refreshes.
    if final_expansion is None:
        final_expansion = FooocusExpansion()

    prepare_text_encoder(async_call=True)
    # Cached CLIP encodings are reset on every refresh.
    clear_all_caches()
    return
@torch.no_grad()
@torch.inference_mode()
def get_candidate_vae(steps, switch, denoise=1.0, refiner_swap_method='joint'):
    """Pick the ``(vae, refiner_vae)`` pair for a run with the given settings.

    Args:
        steps: Total number of sampling steps.
        switch: Step at which sampling would switch to the refiner.
        denoise: Denoising strength.
        refiner_swap_method: One of ``'joint'``, ``'separate'`` or ``'vae'``;
            only validated here.

    Returns:
        ``(final_vae, final_refiner_vae)`` when the refiner has no VAE or UNet of
        its own, or when ``denoise`` exceeds 0.9. Otherwise a single VAE with
        ``None``: the base VAE when ``denoise`` exceeds the base model's share of
        the schedule, else the refiner VAE.
    """
    assert refiner_swap_method in ['joint', 'separate', 'vae']

    refiner_has_own_vae = final_refiner_vae is not None and final_refiner_unet is not None
    if not refiner_has_own_vae or denoise > 0.9:
        return final_vae, final_refiner_vae

    base_share = (float(steps - switch) / float(steps)) ** 0.834  # karras 0.834
    if denoise > base_share:
        return final_vae, None
    return final_refiner_vae, None
partial denoise.') + else: + positive_cond = clip_separate(positive_cond, target_model=final_refiner_unet.model, target_clip=final_clip) + negative_cond = clip_separate(negative_cond, target_model=final_refiner_unet.model, target_clip=final_clip) + target_unet, target_vae, target_refiner_unet, target_refiner_vae \ + = final_refiner_unet, final_refiner_vae, None, None + print(f'[Sampler] only use Refiner because of partial denoise.') + + print(f'[Sampler] refiner_swap_method = {refiner_swap_method}') + + if latent is None: + initial_latent = core.generate_empty_latent(width=width, height=height, batch_size=1) + else: + initial_latent = latent + + minmax_sigmas = calculate_sigmas(sampler=sampler_name, scheduler=scheduler_name, model=final_unet.model, steps=steps, denoise=denoise) + sigma_min, sigma_max = minmax_sigmas[minmax_sigmas > 0].min(), minmax_sigmas.max() + sigma_min = float(sigma_min.cpu().numpy()) + sigma_max = float(sigma_max.cpu().numpy()) + print(f'[Sampler] sigma_min = {sigma_min}, sigma_max = {sigma_max}') + + modules.patch.BrownianTreeNoiseSamplerPatched.global_init( + initial_latent['samples'].to(ldm_patched.modules.model_management.get_torch_device()), + sigma_min, sigma_max, seed=image_seed, cpu=False) + + decoded_latent = None + + if refiner_swap_method == 'joint': + sampled_latent = core.ksampler( + model=target_unet, + refiner=target_refiner_unet, + positive=positive_cond, + negative=negative_cond, + latent=initial_latent, + steps=steps, start_step=0, last_step=steps, disable_noise=False, force_full_denoise=True, + seed=image_seed, + denoise=denoise, + callback_function=callback, + cfg=cfg_scale, + sampler_name=sampler_name, + scheduler=scheduler_name, + refiner_switch=switch, + previewer_start=0, + previewer_end=steps, + disable_preview=disable_preview + ) + decoded_latent = core.decode_vae(vae=target_vae, latent_image=sampled_latent, tiled=tiled) + + if refiner_swap_method == 'separate': + sampled_latent = core.ksampler( + model=target_unet, + 
positive=positive_cond, + negative=negative_cond, + latent=initial_latent, + steps=steps, start_step=0, last_step=switch, disable_noise=False, force_full_denoise=False, + seed=image_seed, + denoise=denoise, + callback_function=callback, + cfg=cfg_scale, + sampler_name=sampler_name, + scheduler=scheduler_name, + previewer_start=0, + previewer_end=steps, + disable_preview=disable_preview + ) + print('Refiner swapped by changing ksampler. Noise preserved.') + + target_model = target_refiner_unet + if target_model is None: + target_model = target_unet + print('Use base model to refine itself - this may because of developer mode.') + + sampled_latent = core.ksampler( + model=target_model, + positive=clip_separate(positive_cond, target_model=target_model.model, target_clip=target_clip), + negative=clip_separate(negative_cond, target_model=target_model.model, target_clip=target_clip), + latent=sampled_latent, + steps=steps, start_step=switch, last_step=steps, disable_noise=True, force_full_denoise=True, + seed=image_seed, + denoise=denoise, + callback_function=callback, + cfg=cfg_scale, + sampler_name=sampler_name, + scheduler=scheduler_name, + previewer_start=switch, + previewer_end=steps, + disable_preview=disable_preview + ) + + target_model = target_refiner_vae + if target_model is None: + target_model = target_vae + decoded_latent = core.decode_vae(vae=target_model, latent_image=sampled_latent, tiled=tiled) + + if refiner_swap_method == 'vae': + modules.patch.patch_settings[os.getpid()].eps_record = 'vae' + + if modules.inpaint_worker.current_task is not None: + modules.inpaint_worker.current_task.unswap() + + sampled_latent = core.ksampler( + model=target_unet, + positive=positive_cond, + negative=negative_cond, + latent=initial_latent, + steps=steps, start_step=0, last_step=switch, disable_noise=False, force_full_denoise=True, + seed=image_seed, + denoise=denoise, + callback_function=callback, + cfg=cfg_scale, + sampler_name=sampler_name, + 
scheduler=scheduler_name, + previewer_start=0, + previewer_end=steps, + disable_preview=disable_preview + ) + print('Fooocus VAE-based swap.') + + target_model = target_refiner_unet + if target_model is None: + target_model = target_unet + print('Use base model to refine itself - this may because of developer mode.') + + sampled_latent = vae_parse(sampled_latent) + + k_sigmas = 1.4 + sigmas = calculate_sigmas(sampler=sampler_name, + scheduler=scheduler_name, + model=target_model.model, + steps=steps, + denoise=denoise)[switch:] * k_sigmas + len_sigmas = len(sigmas) - 1 + + noise_mean = torch.mean(modules.patch.patch_settings[os.getpid()].eps_record, dim=1, keepdim=True) + + if modules.inpaint_worker.current_task is not None: + modules.inpaint_worker.current_task.swap() + + sampled_latent = core.ksampler( + model=target_model, + positive=clip_separate(positive_cond, target_model=target_model.model, target_clip=target_clip), + negative=clip_separate(negative_cond, target_model=target_model.model, target_clip=target_clip), + latent=sampled_latent, + steps=len_sigmas, start_step=0, last_step=len_sigmas, disable_noise=False, force_full_denoise=True, + seed=image_seed+1, + denoise=denoise, + callback_function=callback, + cfg=cfg_scale, + sampler_name=sampler_name, + scheduler=scheduler_name, + previewer_start=switch, + previewer_end=steps, + sigmas=sigmas, + noise_mean=noise_mean, + disable_preview=disable_preview + ) + + target_model = target_refiner_vae + if target_model is None: + target_model = target_vae + decoded_latent = core.decode_vae(vae=target_model, latent_image=sampled_latent, tiled=tiled) + + images = core.pytorch_to_numpy(decoded_latent) + modules.patch.patch_settings[os.getpid()].eps_record = None + return images diff --git a/modules/flags.py b/modules/flags.py new file mode 100644 index 0000000000000000000000000000000000000000..6f12bc8f3f27c4b9ae06f2ee7ac0a90e46122b16 --- /dev/null +++ b/modules/flags.py @@ -0,0 +1,125 @@ +from enum import IntEnum, Enum 
+ +disabled = 'Disabled' +enabled = 'Enabled' +subtle_variation = 'Vary (Subtle)' +strong_variation = 'Vary (Strong)' +upscale_15 = 'Upscale (1.5x)' +upscale_2 = 'Upscale (2x)' +upscale_fast = 'Upscale (Fast 2x)' + +uov_list = [ + disabled, subtle_variation, strong_variation, upscale_15, upscale_2, upscale_fast +] + +CIVITAI_NO_KARRAS = ["euler", "euler_ancestral", "heun", "dpm_fast", "dpm_adaptive", "ddim", "uni_pc"] + +# fooocus: a1111 (Civitai) +KSAMPLER = { + "euler": "Euler", + "euler_ancestral": "Euler a", + "heun": "Heun", + "heunpp2": "", + "dpm_2": "DPM2", + "dpm_2_ancestral": "DPM2 a", + "lms": "LMS", + "dpm_fast": "DPM fast", + "dpm_adaptive": "DPM adaptive", + "dpmpp_2s_ancestral": "DPM++ 2S a", + "dpmpp_sde": "DPM++ SDE", + "dpmpp_sde_gpu": "DPM++ SDE", + "dpmpp_2m": "DPM++ 2M", + "dpmpp_2m_sde": "DPM++ 2M SDE", + "dpmpp_2m_sde_gpu": "DPM++ 2M SDE", + "dpmpp_3m_sde": "", + "dpmpp_3m_sde_gpu": "", + "ddpm": "", + "lcm": "LCM" +} + +SAMPLER_EXTRA = { + "ddim": "DDIM", + "uni_pc": "UniPC", + "uni_pc_bh2": "" +} + +SAMPLERS = KSAMPLER | SAMPLER_EXTRA + +KSAMPLER_NAMES = list(KSAMPLER.keys()) + +SCHEDULER_NAMES = ["normal", "karras", "exponential", "sgm_uniform", "simple", "ddim_uniform", "lcm", "turbo"] +SAMPLER_NAMES = KSAMPLER_NAMES + list(SAMPLER_EXTRA.keys()) + +sampler_list = SAMPLER_NAMES +scheduler_list = SCHEDULER_NAMES + +refiner_swap_method = 'joint' + +cn_ip = "ImagePrompt" +cn_ip_face = "FaceSwap" +cn_canny = "PyraCanny" +cn_cpds = "CPDS" + +ip_list = [cn_ip, cn_canny, cn_cpds, cn_ip_face] +default_ip = cn_ip + +default_parameters = { + cn_ip: (0.5, 0.6), cn_ip_face: (0.9, 0.75), cn_canny: (0.5, 1.0), cn_cpds: (0.5, 1.0) +} # stop, weight + +output_formats = ['png', 'jpg', 'webp'] + +inpaint_engine_versions = ['None', 'v1', 'v2.5', 'v2.6'] +inpaint_option_default = 'Inpaint or Outpaint (default)' +inpaint_option_detail = 'Improve Detail (face, hand, eyes, etc.)' +inpaint_option_modify = 'Modify Content (add objects, change background, etc.)' 
+inpaint_options = [inpaint_option_default, inpaint_option_detail, inpaint_option_modify] + +desc_type_photo = 'Photograph' +desc_type_anime = 'Art/Anime' + + +class MetadataScheme(Enum): + FOOOCUS = 'fooocus' + A1111 = 'a1111' + + +metadata_scheme = [ + (f'{MetadataScheme.FOOOCUS.value} (json)', MetadataScheme.FOOOCUS.value), + (f'{MetadataScheme.A1111.value} (plain text)', MetadataScheme.A1111.value), +] + +lora_count = 5 + +controlnet_image_count = 4 + + +class Steps(IntEnum): + QUALITY = 60 + SPEED = 30 + EXTREME_SPEED = 8 + + +class StepsUOV(IntEnum): + QUALITY = 36 + SPEED = 18 + EXTREME_SPEED = 8 + + +class Performance(Enum): + QUALITY = 'Quality' + SPEED = 'Speed' + EXTREME_SPEED = 'Extreme Speed' + + @classmethod + def list(cls) -> list: + return list(map(lambda c: c.value, cls)) + + def steps(self) -> int | None: + return Steps[self.name].value if Steps[self.name] else None + + def steps_uov(self) -> int | None: + return StepsUOV[self.name].value if Steps[self.name] else None + + +performance_selections = Performance.list() diff --git a/modules/gradio_hijack.py b/modules/gradio_hijack.py new file mode 100644 index 0000000000000000000000000000000000000000..181429ec39a0336ffa43ebf23e4fa2b87dd97674 --- /dev/null +++ b/modules/gradio_hijack.py @@ -0,0 +1,480 @@ +"""gr.Image() component.""" + +from __future__ import annotations + +import warnings +from pathlib import Path +from typing import Any, Literal + +import numpy as np +import PIL +import PIL.ImageOps +import gradio.routes +import importlib + +from gradio_client import utils as client_utils +from gradio_client.documentation import document, set_documentation_group +from gradio_client.serializing import ImgSerializable +from PIL import Image as _Image # using _ to minimize namespace pollution + +from gradio import processing_utils, utils +from gradio.components.base import IOComponent, _Keywords, Block +from gradio.deprecation import warn_style_method_deprecation +from gradio.events import ( + 
Changeable, + Clearable, + Editable, + EventListenerMethod, + Selectable, + Streamable, + Uploadable, +) +from gradio.interpretation import TokenInterpretable + +set_documentation_group("component") +_Image.init() # fixes https://github.com/gradio-app/gradio/issues/2843 + + +@document() +class Image( + Editable, + Clearable, + Changeable, + Streamable, + Selectable, + Uploadable, + IOComponent, + ImgSerializable, + TokenInterpretable, +): + """ + Creates an image component that can be used to upload/draw images (as an input) or display images (as an output). + Preprocessing: passes the uploaded image as a {numpy.array}, {PIL.Image} or {str} filepath depending on `type` -- unless `tool` is `sketch` AND source is one of `upload` or `webcam`. In these cases, a {dict} with keys `image` and `mask` is passed, and the format of the corresponding values depends on `type`. + Postprocessing: expects a {numpy.array}, {PIL.Image} or {str} or {pathlib.Path} filepath to an image and displays the image. + Examples-format: a {str} filepath to a local file that contains the image. 
+ Demos: image_mod, image_mod_default_image + Guides: image-classification-in-pytorch, image-classification-in-tensorflow, image-classification-with-vision-transformers, building-a-pictionary_app, create-your-own-friends-with-a-gan + """ + + def __init__( + self, + value: str | _Image.Image | np.ndarray | None = None, + *, + shape: tuple[int, int] | None = None, + height: int | None = None, + width: int | None = None, + image_mode: Literal[ + "1", "L", "P", "RGB", "RGBA", "CMYK", "YCbCr", "LAB", "HSV", "I", "F" + ] = "RGB", + invert_colors: bool = False, + source: Literal["upload", "webcam", "canvas"] = "upload", + tool: Literal["editor", "select", "sketch", "color-sketch"] | None = None, + type: Literal["numpy", "pil", "filepath"] = "numpy", + label: str | None = None, + every: float | None = None, + show_label: bool | None = None, + show_download_button: bool = True, + container: bool = True, + scale: int | None = None, + min_width: int = 160, + interactive: bool | None = None, + visible: bool = True, + streaming: bool = False, + elem_id: str | None = None, + elem_classes: list[str] | str | None = None, + mirror_webcam: bool = True, + brush_radius: float | None = None, + brush_color: str = "#000000", + mask_opacity: float = 0.7, + show_share_button: bool | None = None, + **kwargs, + ): + """ + Parameters: + value: A PIL Image, numpy array, path or URL for the default value that Image component is going to take. If callable, the function will be called whenever the app loads to set the initial value of the component. + shape: (width, height) shape to crop and resize image when passed to function. If None, matches input image size. Pass None for either width or height to only crop and resize the other. + height: Height of the displayed image in pixels. + width: Width of the displayed image in pixels. + image_mode: "RGB" if color, or "L" if black and white. 
See https://pillow.readthedocs.io/en/stable/handbook/concepts.html for other supported image modes and their meaning. + invert_colors: whether to invert the image as a preprocessing step. + source: Source of image. "upload" creates a box where user can drop an image file, "webcam" allows user to take snapshot from their webcam, "canvas" defaults to a white image that can be edited and drawn upon with tools. + tool: Tools used for editing. "editor" allows a full screen editor (and is the default if source is "upload" or "webcam"), "select" provides a cropping and zoom tool, "sketch" allows you to create a binary sketch (and is the default if source="canvas"), and "color-sketch" allows you to create a sketch in different colors. "color-sketch" can be used with source="upload" or "webcam" to allow sketching on an image. "sketch" can also be used with "upload" or "webcam" to create a mask over an image and in that case both the image and mask are passed into the function as a dictionary with keys "image" and "mask" respectively. + type: The format the image is converted to before being passed into the prediction function. "numpy" converts the image to a numpy array with shape (height, width, 3) and values from 0 to 255, "pil" converts the image to a PIL image object, "filepath" passes a str path to a temporary file containing the image. + label: component name in interface. + every: If `value` is a callable, run the function 'every' number of seconds while the client connection is open. Has no effect otherwise. Queue must be enabled. The event can be accessed (e.g. to cancel it) via this component's .load_event attribute. + show_label: if True, will display label. + show_download_button: If True, will display button to download image. + container: If True, will place the component in a container - providing some extra padding around the border. + scale: relative width compared to adjacent Components in a Row.
For example, if Component A has scale=2, and Component B has scale=1, A will be twice as wide as B. Should be an integer. + min_width: minimum pixel width, will wrap if not sufficient screen space to satisfy this value. If a certain scale value results in this Component being narrower than min_width, the min_width parameter will be respected first. + interactive: if True, will allow users to upload and edit an image; if False, can only be used to display images. If not provided, this is inferred based on whether the component is used as an input or output. + visible: If False, component will be hidden. + streaming: If True when used in a `live` interface, will automatically stream webcam feed. Only valid is source is 'webcam'. + elem_id: An optional string that is assigned as the id of this component in the HTML DOM. Can be used for targeting CSS styles. + elem_classes: An optional list of strings that are assigned as the classes of this component in the HTML DOM. Can be used for targeting CSS styles. + mirror_webcam: If True webcam will be mirrored. Default is True. + brush_radius: Size of the brush for Sketch. Default is None which chooses a sensible default + brush_color: Color of the brush for Sketch as hex string. Default is "#000000". + mask_opacity: Opacity of mask drawn on image, as a value between 0 and 1. + show_share_button: If True, will show a share icon in the corner of the component that allows user to share outputs to Hugging Face Spaces Discussions. If False, icon does not appear. If set to None (default behavior), then the icon appears if this Gradio app is launched on Spaces, but not otherwise. + """ + self.brush_radius = brush_radius + self.brush_color = brush_color + self.mask_opacity = mask_opacity + self.mirror_webcam = mirror_webcam + valid_types = ["numpy", "pil", "filepath"] + if type not in valid_types: + raise ValueError( + f"Invalid value for parameter `type`: {type}. 
Please choose from one of: {valid_types}" + ) + self.type = type + self.shape = shape + self.height = height + self.width = width + self.image_mode = image_mode + valid_sources = ["upload", "webcam", "canvas"] + if source not in valid_sources: + raise ValueError( + f"Invalid value for parameter `source`: {source}. Please choose from one of: {valid_sources}" + ) + self.source = source + if tool is None: + self.tool = "sketch" if source == "canvas" else "editor" + else: + self.tool = tool + self.invert_colors = invert_colors + self.streaming = streaming + self.show_download_button = show_download_button + if streaming and source != "webcam": + raise ValueError("Image streaming only available if source is 'webcam'.") + self.select: EventListenerMethod + """ + Event listener for when the user clicks on a pixel within the image. + Uses event data gradio.SelectData to carry `index` to refer to the [x, y] coordinates of the clicked pixel. + See EventData documentation on how to use this event data. 
+ """ + self.show_share_button = ( + (utils.get_space() is not None) + if show_share_button is None + else show_share_button + ) + IOComponent.__init__( + self, + label=label, + every=every, + show_label=show_label, + container=container, + scale=scale, + min_width=min_width, + interactive=interactive, + visible=visible, + elem_id=elem_id, + elem_classes=elem_classes, + value=value, + **kwargs, + ) + TokenInterpretable.__init__(self) + + def get_config(self): + return { + "image_mode": self.image_mode, + "shape": self.shape, + "height": self.height, + "width": self.width, + "source": self.source, + "tool": self.tool, + "value": self.value, + "streaming": self.streaming, + "mirror_webcam": self.mirror_webcam, + "brush_radius": self.brush_radius, + "brush_color": self.brush_color, + "mask_opacity": self.mask_opacity, + "selectable": self.selectable, + "show_share_button": self.show_share_button, + "show_download_button": self.show_download_button, + **IOComponent.get_config(self), + } + + @staticmethod + def update( + value: Any | Literal[_Keywords.NO_VALUE] | None = _Keywords.NO_VALUE, + height: int | None = None, + width: int | None = None, + label: str | None = None, + show_label: bool | None = None, + show_download_button: bool | None = None, + container: bool | None = None, + scale: int | None = None, + min_width: int | None = None, + interactive: bool | None = None, + visible: bool | None = None, + brush_radius: float | None = None, + brush_color: str | None = None, + mask_opacity: float | None = None, + show_share_button: bool | None = None, + ): + return { + "height": height, + "width": width, + "label": label, + "show_label": show_label, + "show_download_button": show_download_button, + "container": container, + "scale": scale, + "min_width": min_width, + "interactive": interactive, + "visible": visible, + "value": value, + "brush_radius": brush_radius, + "brush_color": brush_color, + "mask_opacity": mask_opacity, + "show_share_button": show_share_button, + 
"__type__": "update", + } + + def _format_image( + self, im: _Image.Image | None + ) -> np.ndarray | _Image.Image | str | None: + """Helper method to format an image based on self.type""" + if im is None: + return im + fmt = im.format + if self.type == "pil": + return im + elif self.type == "numpy": + return np.array(im) + elif self.type == "filepath": + path = self.pil_to_temp_file( + im, dir=self.DEFAULT_TEMP_DIR, format=fmt or "png" + ) + self.temp_files.add(path) + return path + else: + raise ValueError( + "Unknown type: " + + str(self.type) + + ". Please choose from: 'numpy', 'pil', 'filepath'." + ) + + def preprocess( + self, x: str | dict[str, str] + ) -> np.ndarray | _Image.Image | str | dict | None: + """ + Parameters: + x: base64 url data, or (if tool == "sketch") a dict of image and mask base64 url data + Returns: + image in requested format, or (if tool == "sketch") a dict of image and mask in requested format + """ + if x is None: + return x + + mask = None + + if self.tool == "sketch" and self.source in ["upload", "webcam"]: + if isinstance(x, dict): + x, mask = x["image"], x["mask"] + + assert isinstance(x, str) + im = processing_utils.decode_base64_to_image(x) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + im = im.convert(self.image_mode) + if self.shape is not None: + im = processing_utils.resize_and_crop(im, self.shape) + if self.invert_colors: + im = PIL.ImageOps.invert(im) + if ( + self.source == "webcam" + and self.mirror_webcam is True + and self.tool != "color-sketch" + ): + im = PIL.ImageOps.mirror(im) + + if self.tool == "sketch" and self.source in ["upload", "webcam"]: + if mask is not None: + mask_im = processing_utils.decode_base64_to_image(mask) + if mask_im.mode == "RGBA": # whiten any opaque pixels in the mask + alpha_data = mask_im.getchannel("A").convert("L") + mask_im = _Image.merge("RGB", [alpha_data, alpha_data, alpha_data]) + return { + "image": self._format_image(im), + "mask": 
self._format_image(mask_im), + } + else: + return { + "image": self._format_image(im), + "mask": None, + } + + return self._format_image(im) + + def postprocess( + self, y: np.ndarray | _Image.Image | str | Path | None + ) -> str | None: + """ + Parameters: + y: image as a numpy array, PIL Image, string/Path filepath, or string URL + Returns: + base64 url data + """ + if y is None: + return None + if isinstance(y, np.ndarray): + return processing_utils.encode_array_to_base64(y) + elif isinstance(y, _Image.Image): + return processing_utils.encode_pil_to_base64(y) + elif isinstance(y, (str, Path)): + return client_utils.encode_url_or_file_to_base64(y) + else: + raise ValueError("Cannot process this value as an Image") + + def set_interpret_parameters(self, segments: int = 16): + """ + Calculates interpretation score of image subsections by splitting the image into subsections, then using a "leave one out" method to calculate the score of each subsection by whiting out the subsection and measuring the delta of the output value. + Parameters: + segments: Number of interpretation segments to split image into. + """ + self.interpretation_segments = segments + return self + + def _segment_by_slic(self, x): + """ + Helper method that segments an image into superpixels using slic. + Parameters: + x: base64 representation of an image + """ + x = processing_utils.decode_base64_to_image(x) + if self.shape is not None: + x = processing_utils.resize_and_crop(x, self.shape) + resized_and_cropped_image = np.array(x) + try: + from skimage.segmentation import slic + except (ImportError, ModuleNotFoundError) as err: + raise ValueError( + "Error: running this interpretation for images requires scikit-image, please install it first." 
+ ) from err + try: + segments_slic = slic( + resized_and_cropped_image, + self.interpretation_segments, + compactness=10, + sigma=1, + start_label=1, + ) + except TypeError: # For skimage 0.16 and older + segments_slic = slic( + resized_and_cropped_image, + self.interpretation_segments, + compactness=10, + sigma=1, + ) + return segments_slic, resized_and_cropped_image + + def tokenize(self, x): + """ + Segments image into tokens, masks, and leave-one-out-tokens + Parameters: + x: base64 representation of an image + Returns: + tokens: list of tokens, used by the get_masked_input() method + leave_one_out_tokens: list of left-out tokens, used by the get_interpretation_neighbors() method + masks: list of masks, used by the get_interpretation_neighbors() method + """ + segments_slic, resized_and_cropped_image = self._segment_by_slic(x) + tokens, masks, leave_one_out_tokens = [], [], [] + replace_color = np.mean(resized_and_cropped_image, axis=(0, 1)) + for segment_value in np.unique(segments_slic): + mask = segments_slic == segment_value + image_screen = np.copy(resized_and_cropped_image) + image_screen[segments_slic == segment_value] = replace_color + leave_one_out_tokens.append( + processing_utils.encode_array_to_base64(image_screen) + ) + token = np.copy(resized_and_cropped_image) + token[segments_slic != segment_value] = 0 + tokens.append(token) + masks.append(mask) + return tokens, leave_one_out_tokens, masks + + def get_masked_inputs(self, tokens, binary_mask_matrix): + masked_inputs = [] + for binary_mask_vector in binary_mask_matrix: + masked_input = np.zeros_like(tokens[0], dtype=int) + for token, b in zip(tokens, binary_mask_vector): + masked_input = masked_input + token * int(b) + masked_inputs.append(processing_utils.encode_array_to_base64(masked_input)) + return masked_inputs + + def get_interpretation_scores( + self, x, neighbors, scores, masks, tokens=None, **kwargs + ) -> list[list[float]]: + """ + Returns: + A 2D array representing the interpretation 
score of each pixel of the image. + """ + x = processing_utils.decode_base64_to_image(x) + if self.shape is not None: + x = processing_utils.resize_and_crop(x, self.shape) + x = np.array(x) + output_scores = np.zeros((x.shape[0], x.shape[1])) + + for score, mask in zip(scores, masks): + output_scores += score * mask + + max_val, min_val = np.max(output_scores), np.min(output_scores) + if max_val > 0: + output_scores = (output_scores - min_val) / (max_val - min_val) + return output_scores.tolist() + + def style(self, *, height: int | None = None, width: int | None = None, **kwargs): + """ + This method is deprecated. Please set these arguments in the constructor instead. + """ + warn_style_method_deprecation() + if height is not None: + self.height = height + if width is not None: + self.width = width + return self + + def check_streamable(self): + if self.source != "webcam": + raise ValueError("Image streaming only available if source is 'webcam'.") + + def as_example(self, input_data: str | None) -> str: + if input_data is None: + return "" + elif ( + self.root_url + ): # If an externally hosted image, don't convert to absolute path + return input_data + return str(utils.abspath(input_data)) + + +all_components = [] + +if not hasattr(Block, 'original__init__'): + Block.original_init = Block.__init__ + + +def blk_ini(self, *args, **kwargs): + all_components.append(self) + return Block.original_init(self, *args, **kwargs) + + +Block.__init__ = blk_ini + + +gradio.routes.asyncio = importlib.reload(gradio.routes.asyncio) + +if not hasattr(gradio.routes.asyncio, 'original_wait_for'): + gradio.routes.asyncio.original_wait_for = gradio.routes.asyncio.wait_for + + +def patched_wait_for(fut, timeout): + del timeout + return gradio.routes.asyncio.original_wait_for(fut, timeout=65535) + + +gradio.routes.asyncio.wait_for = patched_wait_for + diff --git a/modules/html.py b/modules/html.py new file mode 100644 index 
# 0000000000000000000000000000000000000000..769151a9ff86e460d69d3598fcac0481d59cf17b
# --- /dev/null
# +++ b/modules/html.py
# @@ -0,0 +1,146 @@

# Stylesheet injected into the UI: loader spinner, progress bar and a set of
# layout overrides for individual widgets.
css = '''
.loader-container {
  display: flex; /* Use flex to align items horizontally */
  align-items: center; /* Center items vertically within the container */
  white-space: nowrap; /* Prevent line breaks within the container */
}

.loader {
  border: 8px solid #f3f3f3; /* Light grey */
  border-top: 8px solid #3498db; /* Blue */
  border-radius: 50%;
  width: 30px;
  height: 30px;
  animation: spin 2s linear infinite;
}

@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}

/* Style the progress bar */
progress {
  appearance: none; /* Remove default styling */
  height: 20px; /* Set the height of the progress bar */
  border-radius: 5px; /* Round the corners of the progress bar */
  background-color: #f3f3f3; /* Light grey background */
  width: 100%;
}

/* Style the progress bar container */
.progress-container {
  margin-left: 20px;
  margin-right: 20px;
  flex-grow: 1; /* Allow the progress container to take up remaining space */
}

/* Set the color of the progress bar fill */
progress::-webkit-progress-value {
  background-color: #3498db; /* Blue color for the fill */
}

progress::-moz-progress-bar {
  background-color: #3498db; /* Blue color for the fill in Firefox */
}

/* Style the text on the progress bar */
progress::after {
  content: attr(value '%'); /* Display the progress value followed by '%' */
  position: absolute;
  top: 50%;
  left: 50%;
  transform: translate(-50%, -50%);
  color: white; /* Set text color */
  font-size: 14px; /* Set font size */
}

/* Style other texts */
.loader-container > span {
  margin-left: 5px; /* Add spacing between the progress bar and the text */
}

.progress-bar > .generating {
  display: none !important;
}

.progress-bar{
  height: 30px !important;
}

.type_row{
  height: 80px !important;
}

.type_row_half{
  height: 32px !important;
}

.scroll-hide{
  resize: none !important;
}

.refresh_button{
  border: none !important;
  background: none !important;
  font-size: none !important;
  box-shadow: none !important;
}

.advanced_check_row{
  width: 250px !important;
}

.min_check{
  min-width: min(1px, 100%) !important;
}

.resizable_area {
  resize: vertical;
  overflow: auto !important;
}

.aspect_ratios label {
  width: 140px !important;
}

.aspect_ratios label span {
  white-space: nowrap !important;
}

.aspect_ratios label input {
  margin-left: -5px !important;
}

.lora_enable label {
  height: 100%;
}

.lora_enable label input {
  margin: auto;
}

.lora_enable label span {
  display: none;
}

@-moz-document url-prefix() {
  .lora_weight input[type=number] {
    width: 80px;
  }
}

'''

# Template filled in by make_progress_html(): '*number*' and '*text*' are the
# placeholders it substitutes.
# NOTE(review): the markup of this template was missing in the reviewed copy
# (only blank lines survived, so neither placeholder existed and the function
# returned a constant). It is rebuilt here from the selectors styled above
# (.loader-container, .loader, .progress-container, progress, span) -- confirm
# against the deployed template.
progress_html = '''
<div class="loader-container">
  <div class="loader"></div>
  <div class="progress-container">
    <progress value="*number*" max="100"></progress>
  </div>
  <span>*text*</span>
</div>
'''


def make_progress_html(number, text):
    """Return the progress snippet with *number* and *text* filled in.

    ``number`` is converted with ``str()``; ``text`` is inserted as given and
    must already be a string.
    """
    return progress_html.replace('*number*', str(number)).replace('*text*', text)


# diff --git a/modules/inpaint_worker.py b/modules/inpaint_worker.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..43a7ae23e9bd2cebda69b94013bf1661bd8fd952
# --- /dev/null
# +++ b/modules/inpaint_worker.py
# @@ -0,0 +1,264 @@
import torch
import numpy as np

from PIL import Image, ImageFilter
from modules.util import resample_image, set_image_shape_ceil, get_image_shape_ceil
from modules.upscaler import perform_upscale
import cv2


# Lazily created by InpaintWorker.patch() and reused by later calls.
inpaint_head_model = None


class InpaintHead(torch.nn.Module):
    """Single 3x3 convolution whose weight is loaded from the inpaint head file."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Weight of shape (320, 5, 3, 3); filled by load_state_dict() in patch().
        self.head = torch.nn.Parameter(torch.empty(size=(320, 5, 3, 3), device='cpu'))

    # NOTE(review): this overrides nn.Module.__call__ instead of defining
    # forward(), so module hooks are not run -- confirm that is intended.
    def __call__(self, x):
        # Replicate-pad by one pixel per side so the 3x3 convolution keeps the
        # spatial size of the input.
        x = torch.nn.functional.pad(x, (1, 1, 1, 1), "replicate")
        return torch.nn.functional.conv2d(input=x, weight=self.head)


current_task = None


def box_blur(x, k):
    """Apply a PIL box blur of radius *k* to array *x* and return an array."""
    blurred = Image.fromarray(x).filter(ImageFilter.BoxBlur(k))
    return np.array(blurred)


def max_filter_opencv(x, ksize=3):
    """Maximum filter over a ksize x ksize window, implemented with cv2.dilate."""
    # Make sure the input type is int16
    return cv2.dilate(x, np.ones((ksize, ksize), dtype=np.int16))


def morphological_open(x):
    """Grow the thresholded mask outward with a linear falloff.

    Pixels above 127 start at 256; each of the 32 passes spreads the current
    maximum by one pixel (3x3 window) minus 8, so the value drops by 8 per
    pixel of distance. The result is clipped to 0..255 and returned as uint8.
    """
    # int16 working copy so that 256 and the subtraction below cannot overflow.
    x_int16 = np.zeros_like(x, dtype=np.int16)
    x_int16[x > 127] = 256

    for _ in range(32):
        maxed = max_filter_opencv(x_int16, ksize=3) - 8
        x_int16 = np.maximum(maxed, x_int16)

    # Clip negative values to 0 and convert back to uint8 type
    return np.clip(x_int16, 0, 255).astype(np.uint8)


def up255(x, t=0):
    """Binarise *x*: 255 where ``x > t``, 0 elsewhere (uint8)."""
    y = np.zeros_like(x).astype(np.uint8)
    y[x > t] = 255
    return y


def imsave(x, path):
    """Save array *x* as an image file at *path*."""
    Image.fromarray(x).save(path)


def regulate_abcd(x, a, b, c, d):
    """Clamp a box to the bounds of *x* and return plain ints.

    ``a``/``b`` are clamped to ``[0, H]`` and ``c``/``d`` to ``[0, W]``, where
    ``H, W = x.shape[:2]``.
    """
    H, W = x.shape[:2]
    a = min(max(a, 0), H)
    b = min(max(b, 0), H)
    c = min(max(c, 0), W)
    d = min(max(d, 0), W)
    return int(a), int(b), int(c), int(d)


def compute_initial_abcd(x):
    """Return a square box ``(a, b, c, d)`` around the non-zero region of *x*.

    The box is centred on the bounding box of the non-zero entries, its
    half-size is the larger half-extent scaled by 1.15, and it is clamped to
    the array bounds (so it may end up non-square at the borders).

    Raises:
        ValueError: if *x* has no non-zero entry.
    """
    indices = np.where(x)
    if indices[0].size == 0:
        # np.min/np.max would fail here with an unhelpful reduction error.
        raise ValueError('Cannot compute inpaint area: the mask is empty.')

    a = np.min(indices[0])
    b = np.max(indices[0])
    c = np.min(indices[1])
    d = np.max(indices[1])
    abp = (b + a) // 2
    abm = (b - a) // 2
    cdp = (d + c) // 2
    cdm = (d - c) // 2
    l = int(max(abm, cdm) * 1.15)
    a = abp - l
    b = abp + l + 1
    c = cdp - l
    d = cdp + l + 1
    return regulate_abcd(x, a, b, c, d)


def solve_abcd(x, a, b, c, d, k):
    """Grow the box until it spans at least fraction *k* of each dimension.

    *k* must lie in ``[0, 1]``; ``k == 1`` returns the whole array. Otherwise
    the shorter side is widened by one pixel per side per step (both sides
    once one of them already spans the full dimension), clamping to the array
    bounds after every step.
    """
    k = float(k)
    assert 0.0 <= k <= 1.0

    H, W = x.shape[:2]
    if k == 1.0:
        return 0, H, 0, W

    while not (b - a >= H * k and d - c >= W * k):
        add_h = (b - a) < (d - c)
        add_w = not add_h

        # A side that already fills its dimension cannot grow, so grow the
        # other one as well.
        if b - a == H:
            add_w = True

        if d - c == W:
            add_h = True

        if add_h:
            a -= 1
            b += 1

        if add_w:
            c -= 1
            d += 1

        a, b, c, d = regulate_abcd(x, a, b, c, d)
    return a, b, c, d


def fooocus_fill(image, mask):
    """Fill the masked region of *image* with progressively finer blurs.

    Pixels where ``mask < 127`` are restored from the original after every
    blur pass, so only the remaining (masked) pixels end up changed.
    """
    current_image = image.copy()
    raw_image = image.copy()
    area = np.where(mask < 127)
    store = raw_image[area]

    # (blur radius, number of passes), from coarse to fine.
    for k, repeats in [(512, 2), (256, 2), (128, 4), (64, 4), (33, 8), (15, 8), (5, 16), (3, 16)]:
        for _ in range(repeats):
            current_image = box_blur(current_image, k)
            current_image[area] = store

    return current_image


class InpaintWorker:
    """Holds the cropped image/mask and the latents of one inpaint job."""

    def __init__(self, image, mask, use_fill=True, k=0.618):
        a, b, c, d = compute_initial_abcd(mask > 0)
        a, b, c, d = solve_abcd(mask, a, b, c, d, k=k)

        # interested area
        self.interested_area = (a, b, c, d)
        self.interested_mask = mask[a:b, c:d]
        self.interested_image = image[a:b, c:d]

        # super resolution
        if get_image_shape_ceil(self.interested_image) < 1024:
            self.interested_image = perform_upscale(self.interested_image)

        # resize to make images ready for diffusion
        self.interested_image = set_image_shape_ceil(self.interested_image, 1024)
        self.interested_fill = self.interested_image.copy()
        H, W, C = self.interested_image.shape

        # process mask
        self.interested_mask = up255(resample_image(self.interested_mask, W, H), t=127)

        # compute filling
        if use_fill:
            self.interested_fill = fooocus_fill(self.interested_image, self.interested_mask)

        # soft pixels
        self.mask = morphological_open(mask)
        self.image = image

        # ending
        self.latent = None
        self.latent_after_swap = None
        self.swapped = False
        self.latent_mask = None
        self.inpaint_head_feature = None

    def load_latent(self, latent_fill, latent_mask, latent_swap=None):
        """Store the latents used by swap()/unswap()."""
        self.latent = latent_fill
        self.latent_mask = latent_mask
        self.latent_after_swap = latent_swap

    def patch(self, inpaint_head_model_path, inpaint_latent, inpaint_latent_mask, model):
        """Return a clone of *model* with the inpaint head feature added.

        The head is loaded once from *inpaint_head_model_path* and cached in
        the module-level ``inpaint_head_model``.
        """
        global inpaint_head_model

        if inpaint_head_model is None:
            inpaint_head_model = InpaintHead()
            # NOTE(review): torch.load unpickles the file; only load head
            # files from a trusted source.
            sd = torch.load(inpaint_head_model_path, map_location='cpu')
            inpaint_head_model.load_state_dict(sd)

        feed = torch.cat([
            inpaint_latent_mask,
            model.model.process_latent_in(inpaint_latent)
        ], dim=1)

        inpaint_head_model.to(device=feed.device, dtype=feed.dtype)
        inpaint_head_feature = inpaint_head_model(feed)

        def input_block_patch(h, transformer_options):
            # Only the block with index 0 receives the extra feature.
            if transformer_options["block"][1] == 0:
                h = h + inpaint_head_feature.to(h)
            return h

        m = model.clone()
        m.set_model_input_block_patch(input_block_patch)
        return m

    def swap(self):
        """Exchange ``latent`` and ``latent_after_swap`` if not already swapped."""
        if self.swapped or self.latent is None or self.latent_after_swap is None:
            return

        self.latent, self.latent_after_swap = self.latent_after_swap, self.latent
        self.swapped = True

    def unswap(self):
        """Undo swap(); does nothing when the latents are not swapped."""
        if not self.swapped or self.latent is None or self.latent_after_swap is None:
            return

        self.latent, self.latent_after_swap = self.latent_after_swap, self.latent
        self.swapped = False

    def color_correction(self, img):
        """Blend *img* over the original image using the soft mask as weight."""
        fg = img.astype(np.float32)
        bg = self.image.copy().astype(np.float32)
        w = self.mask[:, :, None].astype(np.float32) / 255.0
        y = fg * w + bg * (1 - w)
        return y.clip(0, 255).astype(np.uint8)

    def post_process(self, img):
        """Paste *img* back into the area of interest and blend with the original."""
        a, b, c, d = self.interested_area
        content = resample_image(img, d - c, b - a)
        result = self.image.copy()
        result[a:b, c:d] = content
        return self.color_correction(result)

    def visualize_mask_processing(self):
        """Return ``[fill, mask, image]`` of the area of interest."""
        return [self.interested_fill, self.interested_mask, self.interested_image]


# diff --git a/modules/launch_util.py b/modules/launch_util.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..b483d5158ca5eeeff6f385b1a94990f9e5f6e871
# --- /dev/null
# +++ b/modules/launch_util.py
# @@ -0,0 +1,103 @@
import os
import importlib
import importlib.util
import subprocess
import sys
import re
import logging
import importlib.metadata
import packaging.version
from packaging.requirements import Requirement




logging.getLogger("torch.distributed.nn").setLevel(logging.ERROR)  # sshh...
+logging.getLogger("xformers").addFilter(lambda record: 'A matching Triton is not available' not in record.getMessage()) + +re_requirement = re.compile(r"\s*([-\w]+)\s*(?:==\s*([-+.\w]+))?\s*") + +python = sys.executable +default_command_live = (os.environ.get('LAUNCH_LIVE_OUTPUT') == "1") +index_url = os.environ.get('INDEX_URL', "") + +modules_path = os.path.dirname(os.path.realpath(__file__)) +script_path = os.path.dirname(modules_path) + + +def is_installed(package): + try: + spec = importlib.util.find_spec(package) + except ModuleNotFoundError: + return False + + return spec is not None + + +def run(command, desc=None, errdesc=None, custom_env=None, live: bool = default_command_live) -> str: + if desc is not None: + print(desc) + + run_kwargs = { + "args": command, + "shell": True, + "env": os.environ if custom_env is None else custom_env, + "encoding": 'utf8', + "errors": 'ignore', + } + + if not live: + run_kwargs["stdout"] = run_kwargs["stderr"] = subprocess.PIPE + + result = subprocess.run(**run_kwargs) + + if result.returncode != 0: + error_bits = [ + f"{errdesc or 'Error running command'}.", + f"Command: {command}", + f"Error code: {result.returncode}", + ] + if result.stdout: + error_bits.append(f"stdout: {result.stdout}") + if result.stderr: + error_bits.append(f"stderr: {result.stderr}") + raise RuntimeError("\n".join(error_bits)) + + return (result.stdout or "") + + +def run_pip(command, desc=None, live=default_command_live): + try: + index_url_line = f' --index-url {index_url}' if index_url != '' else '' + return run(f'"{python}" -m pip {command} --prefer-binary{index_url_line}', desc=f"Installing {desc}", + errdesc=f"Couldn't install {desc}", live=live) + except Exception as e: + print(e) + print(f'CMD Failed {desc}: {command}') + return None + + +def requirements_met(requirements_file): + with open(requirements_file, "r", encoding="utf8") as file: + for line in file: + line = line.strip() + if line == "" or line.startswith('#'): + continue + + 
requirement = Requirement(line) + package = requirement.name + + try: + version_installed = importlib.metadata.version(package) + installed_version = packaging.version.parse(version_installed) + + # Check if the installed version satisfies the requirement + if installed_version not in requirement.specifier: + print(f"Version mismatch for {package}: Installed version {version_installed} does not meet requirement {requirement}") + return False + except Exception as e: + print(f"Error checking version for {package}: {e}") + return False + + return True + diff --git a/modules/localization.py b/modules/localization.py new file mode 100644 index 0000000000000000000000000000000000000000..b21d4a564d134ac0be00d83c7005627d601d206e --- /dev/null +++ b/modules/localization.py @@ -0,0 +1,60 @@ +import json +import os + + +current_translation = {} +localization_root = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'language') + + +def localization_js(filename): + global current_translation + + if isinstance(filename, str): + full_name = os.path.abspath(os.path.join(localization_root, filename + '.json')) + if os.path.exists(full_name): + try: + with open(full_name, encoding='utf-8') as f: + current_translation = json.load(f) + assert isinstance(current_translation, dict) + for k, v in current_translation.items(): + assert isinstance(k, str) + assert isinstance(v, str) + except Exception as e: + print(str(e)) + print(f'Failed to load localization file {full_name}') + + # current_translation = {k: 'XXX' for k in current_translation.keys()} # use this to see if all texts are covered + + return f"window.localization = {json.dumps(current_translation)}" + + +def dump_english_config(components): + all_texts = [] + for c in components: + label = getattr(c, 'label', None) + value = getattr(c, 'value', None) + choices = getattr(c, 'choices', None) + info = getattr(c, 'info', None) + + if isinstance(label, str): + all_texts.append(label) + if isinstance(value, str): + 
def match_lora(lora, to_load):
    """Pair the tensors of a LoRA-style state dict with the model keys they patch.

    Args:
        lora: mapping from tensor name to value, as read from the LoRA file.
        to_load: mapping from LoRA key prefix to the model weight key that
            prefix targets.

    Returns:
        A ``(patch_dict, remaining_dict)`` tuple. ``patch_dict`` maps a model
        key to ``(patch_type, payload)`` where ``patch_type`` is one of
        ``'fooocus'``, ``'lora'``, ``'loha'``, ``'lokr'``, ``'glora'`` or
        ``'diff'``. ``remaining_dict`` holds every entry of ``lora`` that was
        not consumed by any patch.
    """
    patch_dict = {}
    loaded_keys = set()

    # (up, down, mid) name templates; the first layout whose "up" tensor exists wins.
    lora_layouts = (
        ("{}.lora_up.weight", "{}.lora_down.weight", "{}.lora_mid.weight"),  # regular
        ("{}_lora.up.weight", "{}_lora.down.weight", None),  # diffusers
        ("{}.lora_linear_layer.up.weight", "{}.lora_linear_layer.down.weight", None),  # transformers
    )
    lokr_suffixes = ("lokr_w1", "lokr_w2", "lokr_w1_a", "lokr_w1_b", "lokr_w2_a", "lokr_w2_b", "lokr_t2")

    def take(name):
        """Return ``lora[name]`` and mark it as consumed, or None when absent."""
        if name in lora:
            loaded_keys.add(name)
            return lora[name]
        return None

    for x in to_load:
        target = to_load[x]

        # The file already stores a tensor under the final model key.
        if target in lora:
            patch_dict[target] = ('fooocus', lora[target])
            loaded_keys.add(target)
            continue

        alpha = None
        alpha_name = "{}.alpha".format(x)
        if alpha_name in lora:
            alpha = lora[alpha_name].item()
            loaded_keys.add(alpha_name)

        ######## lora
        for up_fmt, down_fmt, mid_fmt in lora_layouts:
            A_name = up_fmt.format(x)
            if A_name not in lora:
                continue
            B_name = down_fmt.format(x)
            mid = take(mid_fmt.format(x)) if mid_fmt is not None else None
            patch_dict[target] = ("lora", (lora[A_name], lora[B_name], alpha, mid))
            loaded_keys.add(A_name)
            loaded_keys.add(B_name)
            break

        ######## loha
        hada_w1_a_name = "{}.hada_w1_a".format(x)
        if hada_w1_a_name in lora:
            hada_w1_b_name = "{}.hada_w1_b".format(x)
            hada_w2_a_name = "{}.hada_w2_a".format(x)
            hada_w2_b_name = "{}.hada_w2_b".format(x)
            hada_t1_name = "{}.hada_t1".format(x)
            hada_t2_name = "{}.hada_t2".format(x)

            hada_t1 = None
            hada_t2 = None
            if hada_t1_name in lora:
                # t2 is read unconditionally once t1 is present.
                hada_t1 = lora[hada_t1_name]
                hada_t2 = lora[hada_t2_name]
                loaded_keys.add(hada_t1_name)
                loaded_keys.add(hada_t2_name)

            patch_dict[target] = ("loha", (lora[hada_w1_a_name], lora[hada_w1_b_name], alpha,
                                           lora[hada_w2_a_name], lora[hada_w2_b_name], hada_t1, hada_t2))
            loaded_keys.update((hada_w1_a_name, hada_w1_b_name, hada_w2_a_name, hada_w2_b_name))

        ######## lokr
        lokr_w1, lokr_w2, lokr_w1_a, lokr_w1_b, lokr_w2_a, lokr_w2_b, lokr_t2 = [
            take("{}.{}".format(x, suffix)) for suffix in lokr_suffixes
        ]
        if (lokr_w1 is not None) or (lokr_w2 is not None) or (lokr_w1_a is not None) or (lokr_w2_a is not None):
            patch_dict[target] = ("lokr", (lokr_w1, lokr_w2, alpha, lokr_w1_a, lokr_w1_b,
                                           lokr_w2_a, lokr_w2_b, lokr_t2))

        ######## glora
        a1_name = "{}.a1.weight".format(x)
        if a1_name in lora:
            a2_name = "{}.a2.weight".format(x)
            b1_name = "{}.b1.weight".format(x)
            b2_name = "{}.b2.weight".format(x)
            patch_dict[target] = ("glora", (lora[a1_name], lora[a2_name], lora[b1_name], lora[b2_name], alpha))
            loaded_keys.update((a1_name, a2_name, b1_name, b2_name))

        ######## norm / plain diffs
        w_norm_name = "{}.w_norm".format(x)
        b_norm_name = "{}.b_norm".format(x)
        w_norm = lora.get(w_norm_name, None)
        b_norm = lora.get(b_norm_name, None)

        if w_norm is not None:
            loaded_keys.add(w_norm_name)
            patch_dict[target] = ("diff", (w_norm,))
            if b_norm is not None:
                loaded_keys.add(b_norm_name)
                # the bias key is the weight key with its ".weight" suffix swapped
                patch_dict["{}.bias".format(target[:-len(".weight")])] = ("diff", (b_norm,))

        diff_name = "{}.diff".format(x)
        diff_weight = lora.get(diff_name, None)
        if diff_weight is not None:
            patch_dict[target] = ("diff", (diff_weight,))
            loaded_keys.add(diff_name)

        diff_bias_name = "{}.diff_b".format(x)
        diff_bias = lora.get(diff_bias_name, None)
        if diff_bias is not None:
            patch_dict["{}.bias".format(target[:-len(".weight")])] = ("diff", (diff_bias,))
            loaded_keys.add(diff_bias_name)

    remaining_dict = {name: value for name, value in lora.items() if name not in loaded_keys}
    return patch_dict, remaining_dict
modules.config +import modules.sdxl_styles +from modules.flags import MetadataScheme, Performance, Steps +from modules.flags import SAMPLERS, CIVITAI_NO_KARRAS +from modules.util import quote, unquote, extract_styles_from_prompt, is_json, get_file_from_folder_list, calculate_sha256 + +re_param_code = r'\s*(\w[\w \-/]+):\s*("(?:\\.|[^\\"])+"|[^,]*)(?:,|$)' +re_param = re.compile(re_param_code) +re_imagesize = re.compile(r"^(\d+)x(\d+)$") + +hash_cache = {} + + +def load_parameter_button_click(raw_metadata: dict | str, is_generating: bool): + loaded_parameter_dict = raw_metadata + if isinstance(raw_metadata, str): + loaded_parameter_dict = json.loads(raw_metadata) + assert isinstance(loaded_parameter_dict, dict) + + results = [len(loaded_parameter_dict) > 0, 1] + + get_str('prompt', 'Prompt', loaded_parameter_dict, results) + get_str('negative_prompt', 'Negative Prompt', loaded_parameter_dict, results) + get_list('styles', 'Styles', loaded_parameter_dict, results) + get_str('performance', 'Performance', loaded_parameter_dict, results) + get_steps('steps', 'Steps', loaded_parameter_dict, results) + get_float('overwrite_switch', 'Overwrite Switch', loaded_parameter_dict, results) + get_resolution('resolution', 'Resolution', loaded_parameter_dict, results) + get_float('guidance_scale', 'Guidance Scale', loaded_parameter_dict, results) + get_float('sharpness', 'Sharpness', loaded_parameter_dict, results) + get_adm_guidance('adm_guidance', 'ADM Guidance', loaded_parameter_dict, results) + get_str('refiner_swap_method', 'Refiner Swap Method', loaded_parameter_dict, results) + get_float('adaptive_cfg', 'CFG Mimicking from TSNR', loaded_parameter_dict, results) + get_str('base_model', 'Base Model', loaded_parameter_dict, results) + get_str('refiner_model', 'Refiner Model', loaded_parameter_dict, results) + get_float('refiner_switch', 'Refiner Switch', loaded_parameter_dict, results) + get_str('sampler', 'Sampler', loaded_parameter_dict, results) + get_str('scheduler', 
'Scheduler', loaded_parameter_dict, results) + get_seed('seed', 'Seed', loaded_parameter_dict, results) + + if is_generating: + results.append(gr.update()) + else: + results.append(gr.update(visible=True)) + + results.append(gr.update(visible=False)) + + get_freeu('freeu', 'FreeU', loaded_parameter_dict, results) + + for i in range(modules.config.default_max_lora_number): + get_lora(f'lora_combined_{i + 1}', f'LoRA {i + 1}', loaded_parameter_dict, results) + + return results + + +def get_str(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + assert isinstance(h, str) + results.append(h) + except: + results.append(gr.update()) + + +def get_list(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + h = eval(h) + assert isinstance(h, list) + results.append(h) + except: + results.append(gr.update()) + + +def get_float(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + assert h is not None + h = float(h) + results.append(h) + except: + results.append(gr.update()) + + +def get_steps(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + assert h is not None + h = int(h) + # if not in steps or in steps and performance is not the same + if h not in iter(Steps) or Steps(h).name.casefold() != source_dict.get('performance', '').replace(' ', '_').casefold(): + results.append(h) + return + results.append(-1) + except: + results.append(-1) + + +def get_resolution(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + width, height = eval(h) + formatted = 
modules.config.add_ratio(f'{width}*{height}') + if formatted in modules.config.available_aspect_ratios: + results.append(formatted) + results.append(-1) + results.append(-1) + else: + results.append(gr.update()) + results.append(int(width)) + results.append(int(height)) + except: + results.append(gr.update()) + results.append(gr.update()) + results.append(gr.update()) + + +def get_seed(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + assert h is not None + h = int(h) + results.append(False) + results.append(h) + except: + results.append(gr.update()) + results.append(gr.update()) + + +def get_adm_guidance(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + p, n, e = eval(h) + results.append(float(p)) + results.append(float(n)) + results.append(float(e)) + except: + results.append(gr.update()) + results.append(gr.update()) + results.append(gr.update()) + + +def get_freeu(key: str, fallback: str | None, source_dict: dict, results: list, default=None): + try: + h = source_dict.get(key, source_dict.get(fallback, default)) + b1, b2, s1, s2 = eval(h) + results.append(True) + results.append(float(b1)) + results.append(float(b2)) + results.append(float(s1)) + results.append(float(s2)) + except: + results.append(False) + results.append(gr.update()) + results.append(gr.update()) + results.append(gr.update()) + results.append(gr.update()) + + +def get_lora(key: str, fallback: str | None, source_dict: dict, results: list): + try: + n, w = source_dict.get(key, source_dict.get(fallback)).split(' : ') + w = float(w) + results.append(True) + results.append(n) + results.append(w) + except: + results.append(True) + results.append('None') + results.append(1) + + +def get_sha256(filepath): + global hash_cache + if filepath not in hash_cache: + hash_cache[filepath] = 
def parse_meta_from_preset(preset_content):
    """Translate a preset dictionary into the metadata dictionary layout.

    For every ``settings_key -> meta_key`` pair in
    ``modules.config.possible_preset_keys`` the preset value is used when it is
    present and not None; otherwise the attribute of the same name on
    ``modules.config`` is used.

    Args:
        preset_content: preset dictionary keyed by settings name.

    Returns:
        Dictionary keyed by metadata name, plus ``lora_combined_<n>`` entries.
    """
    assert isinstance(preset_content, dict)
    prepared = {}

    for settings_key, meta_key in modules.config.possible_preset_keys.items():
        if settings_key == "default_loras":
            # A preset entry wins whenever the key exists, even if its value is None.
            loras = preset_content.get(settings_key, getattr(modules.config, settings_key))
            for slot, lora in enumerate(loras[:5], start=1):
                prepared[f'lora_combined_{slot}'] = ' : '.join(str(part) for part in lora)
            continue

        # None both when the key is absent and when it is explicitly null.
        override = preset_content.get(settings_key)

        if settings_key == "default_aspect_ratio":
            if override is not None:
                width, height = override.split('*')
            else:
                width, height = getattr(modules.config, settings_key).split('×')
                # the config value carries extra text after the first space
                height = height[:height.index(" ")]
            prepared[meta_key] = str((width, height))
            continue

        value = override if override is not None else getattr(modules.config, settings_key)
        # styles are stored as the textual form of the list
        prepared[meta_key] = str(value) if settings_key == "default_styles" else value

    return prepared
class A1111MetadataParser(MetadataParser):
    """Reads and writes generation metadata in the A1111 plain-text layout.

    The text consists of the prompt lines, an optional ``Negative prompt:``
    section and a final line of comma separated ``Key: value`` pairs.
    """

    def get_scheme(self) -> MetadataScheme:
        return MetadataScheme.A1111

    # internal metadata key -> label used in the A1111 parameter line
    fooocus_to_a1111 = {
        'raw_prompt': 'Raw prompt',
        'raw_negative_prompt': 'Raw negative prompt',
        'negative_prompt': 'Negative prompt',
        'styles': 'Styles',
        'performance': 'Performance',
        'steps': 'Steps',
        'sampler': 'Sampler',
        'scheduler': 'Scheduler',
        'guidance_scale': 'CFG scale',
        'seed': 'Seed',
        'resolution': 'Size',
        'sharpness': 'Sharpness',
        'adm_guidance': 'ADM Guidance',
        'refiner_swap_method': 'Refiner Swap Method',
        'adaptive_cfg': 'Adaptive CFG',
        'overwrite_switch': 'Overwrite Switch',
        'freeu': 'FreeU',
        'base_model': 'Model',
        'base_model_hash': 'Model hash',
        'refiner_model': 'Refiner',
        'refiner_model_hash': 'Refiner hash',
        'lora_hashes': 'Lora hashes',
        'lora_weights': 'Lora weights',
        'created_by': 'User',
        'version': 'Version'
    }

    def parse_json(self, metadata: str) -> dict:
        """Parse A1111 parameter text into a metadata dictionary.

        Args:
            metadata: the raw parameter text.

        Returns:
            Dictionary keyed by the internal metadata names. Parameters whose
            label is unknown are reported on stdout and skipped.
        """
        metadata_prompt = ''
        metadata_negative_prompt = ''

        done_with_prompt = False

        *lines, lastline = metadata.strip().split("\n")
        # Fewer than three "Key: value" matches: treat the last line as prompt text.
        if len(re_param.findall(lastline)) < 3:
            lines.append(lastline)
            lastline = ''

        negative_prefix = f"{self.fooocus_to_a1111['negative_prompt']}:"
        for line in lines:
            line = line.strip()
            if line.startswith(negative_prefix):
                done_with_prompt = True
                line = line[len(negative_prefix):].strip()
            if done_with_prompt:
                metadata_negative_prompt += ('' if metadata_negative_prompt == '' else "\n") + line
            else:
                metadata_prompt += ('' if metadata_prompt == '' else "\n") + line

        found_styles, prompt, negative_prompt = extract_styles_from_prompt(metadata_prompt, metadata_negative_prompt)

        data = {
            'prompt': prompt,
            'negative_prompt': negative_prompt
        }

        # Reverse lookup built once instead of rebuilding two lists per parameter.
        a1111_to_fooocus = {label: key for key, label in self.fooocus_to_a1111.items()}

        for k, v in re_param.findall(lastline):
            try:
                if v != '' and v[0] == '"' and v[-1] == '"':
                    v = unquote(v)

                m = re_imagesize.match(v)
                if m is not None:
                    data['resolution'] = str((m.group(1), m.group(2)))
                else:
                    data[a1111_to_fooocus[k]] = v
            except Exception:
                print(f"Error parsing \"{k}: {v}\"")

        # workaround for multiline prompts
        if 'raw_prompt' in data:
            data['prompt'] = data['raw_prompt']
            raw_prompt = data['raw_prompt'].replace("\n", ', ')
            if metadata_prompt != raw_prompt and modules.sdxl_styles.fooocus_expansion not in found_styles:
                found_styles.append(modules.sdxl_styles.fooocus_expansion)

        if 'raw_negative_prompt' in data:
            data['negative_prompt'] = data['raw_negative_prompt']

        data['styles'] = str(found_styles)

        # try to load performance based on steps, fallback for direct A1111 imports
        if 'steps' in data and 'performance' not in data:
            try:
                data['performance'] = Performance[Steps(int(data['steps'])).name].value
            except (ValueError, KeyError):
                # Must be a tuple: `ValueError | KeyError` builds a union type,
                # which `except` rejects with TypeError as soon as an exception
                # actually has to be matched.
                pass

        if 'sampler' in data:
            data['sampler'] = data['sampler'].replace(' Karras', '')
            # get key
            for k, v in SAMPLERS.items():
                if v == data['sampler']:
                    data['sampler'] = k
                    break

        for key in ['base_model', 'refiner_model']:
            if key in data:
                for filename in modules.config.model_filenames:
                    path = Path(filename)
                    if data[key] == path.stem:
                        data[key] = filename
                        break

        # An empty value is what parse_string emits when no LoRA is active;
        # splitting it would yield one empty entry and crash on unpacking.
        if data.get('lora_hashes'):
            lora_filenames = modules.config.lora_filenames.copy()
            if modules.config.sdxl_lcm_lora in lora_filenames:
                lora_filenames.remove(modules.config.sdxl_lcm_lora)
            for li, lora in enumerate(data['lora_hashes'].split(', ')):
                fields = lora.split(': ')
                if len(fields) != 3:
                    # not "name: hash: weight" - no weight to restore, so skip it
                    print(f"Error parsing \"{lora}\"")
                    continue
                lora_name, lora_hash, lora_weight = fields
                for filename in lora_filenames:
                    path = Path(filename)
                    if lora_name == path.stem:
                        data[f'lora_combined_{li + 1}'] = f'{filename} : {lora_weight}'
                        break

        return data

    def parse_string(self, metadata: dict) -> str:
        """Serialise metadata into A1111 parameter text.

        Args:
            metadata: iterable of ``(label, key, value)`` triples; only key and
                value are used.

        Returns:
            The prompt, the optional negative prompt line and the parameter
            line, joined by newlines.
        """
        data = {k: v for _, k, v in metadata}

        # NOTE(review): eval() on the resolution string - presumably produced by
        # this application itself; switch to ast.literal_eval if that value can
        # ever come from an external source.
        width, height = eval(data['resolution'])

        sampler = data['sampler']
        scheduler = data['scheduler']
        if sampler in SAMPLERS and SAMPLERS[sampler] != '':
            sampler = SAMPLERS[sampler]
            if sampler not in CIVITAI_NO_KARRAS and scheduler == 'karras':
                sampler += ' Karras'

        generation_params = {
            self.fooocus_to_a1111['steps']: self.steps,
            self.fooocus_to_a1111['sampler']: sampler,
            self.fooocus_to_a1111['seed']: data['seed'],
            self.fooocus_to_a1111['resolution']: f'{width}x{height}',
            self.fooocus_to_a1111['guidance_scale']: data['guidance_scale'],
            self.fooocus_to_a1111['sharpness']: data['sharpness'],
            self.fooocus_to_a1111['adm_guidance']: data['adm_guidance'],
            self.fooocus_to_a1111['base_model']: Path(data['base_model']).stem,
            self.fooocus_to_a1111['base_model_hash']: self.base_model_hash,

            self.fooocus_to_a1111['performance']: data['performance'],
            self.fooocus_to_a1111['scheduler']: scheduler,
            # workaround for multiline prompts
            self.fooocus_to_a1111['raw_prompt']: self.raw_prompt,
            self.fooocus_to_a1111['raw_negative_prompt']: self.raw_negative_prompt,
        }

        if self.refiner_model_name not in ['', 'None']:
            generation_params |= {
                self.fooocus_to_a1111['refiner_model']: self.refiner_model_name,
                self.fooocus_to_a1111['refiner_model_hash']: self.refiner_model_hash
            }

        for key in ['adaptive_cfg', 'overwrite_switch', 'refiner_swap_method', 'freeu']:
            if key in data:
                generation_params[self.fooocus_to_a1111[key]] = data[key]

        # workaround for Fooocus not knowing LoRA name in LoRA metadata
        lora_hashes_string = ', '.join(
            f'{lora_name}: {lora_hash}: {lora_weight}' for lora_name, lora_weight, lora_hash in self.loras)

        generation_params |= {
            self.fooocus_to_a1111['lora_hashes']: lora_hashes_string,
            self.fooocus_to_a1111['version']: data['version']
        }

        if modules.config.metadata_created_by != '':
            generation_params[self.fooocus_to_a1111['created_by']] = modules.config.metadata_created_by

        generation_params_text = ", ".join(
            [k if k == v else f'{k}: {quote(v)}' for k, v in generation_params.items() if
             v is not None])
        positive_prompt_resolved = ', '.join(self.full_prompt)
        negative_prompt_resolved = ', '.join(self.full_negative_prompt)
        negative_prompt_text = f"\nNegative prompt: {negative_prompt_resolved}" if negative_prompt_resolved else ""
        return f"{positive_prompt_resolved}{negative_prompt_text}\n{generation_params_text}".strip()
def get_metadata_parser(metadata_scheme: MetadataScheme) -> MetadataParser:
    """Return a fresh parser instance for ``metadata_scheme``.

    Raises:
        NotImplementedError: when the scheme is neither FOOOCUS nor A1111.
    """
    if metadata_scheme == MetadataScheme.FOOOCUS:
        return FooocusMetadataParser()
    if metadata_scheme == MetadataScheme.A1111:
        return A1111MetadataParser()
    raise NotImplementedError
def get_exif(metadata: str | None, metadata_scheme: str):
    """Build an Exif container holding the metadata, the software tag and the scheme.

    Args:
        metadata: text stored in the UserComment tag.
        metadata_scheme: scheme identifier stored in the MakerNote tag.

    Returns:
        The populated ``Image.Exif`` object.
    """
    exif = Image.Exif()
    # tags see https://github.com/python-pillow/Pillow/blob/9.2.x/src/PIL/ExifTags.py
    tag_values = (
        (0x9286, metadata),  # UserComment
        (0x0131, 'Fooocus v' + fooocus_version.version),  # Software
        (0x927C, metadata_scheme),  # MakerNote
    )
    for tag, value in tag_values:
        exif[tag] = value
    return exif
@contextlib.contextmanager
def use_patched_ops(operations):
    """Temporarily replace selected ``torch.nn`` layer classes.

    While the ``with`` body runs, ``torch.nn.Linear``, ``Conv2d``, ``Conv3d``,
    ``GroupNorm`` and ``LayerNorm`` are rebound to the attributes of the same
    name on ``operations``. The original classes are restored on exit, whether
    the body finished normally or raised; an exception from the body propagates
    to the caller.

    Args:
        operations: object exposing the five replacement classes as attributes.
    """
    op_names = ('Linear', 'Conv2d', 'Conv3d', 'GroupNorm', 'LayerNorm')
    backups = {op_name: getattr(torch.nn, op_name) for op_name in op_names}

    try:
        for op_name in op_names:
            setattr(torch.nn, op_name, getattr(operations, op_name))
        yield
    finally:
        # No `return` here on purpose: a return inside `finally` would discard
        # any exception raised by the with-body.
        for op_name, original in backups.items():
            setattr(torch.nn, op_name, original)
class PatchSettings:
    """Plain holder for the tunable patch values plus two runtime fields.

    The six constructor arguments are stored verbatim under the same names.
    ``global_diffusion_progress`` starts at 0 and ``eps_record`` at None.
    """

    def __init__(self,
                 sharpness=2.0,
                 adm_scaler_end=0.3,
                 positive_adm_scale=1.5,
                 negative_adm_scale=0.8,
                 controlnet_softness=0.25,
                 adaptive_cfg=7.0):
        # caller-supplied values
        (self.sharpness,
         self.adm_scaler_end,
         self.positive_adm_scale,
         self.negative_adm_scale,
         self.controlnet_softness,
         self.adaptive_cfg) = (sharpness,
                               adm_scaler_end,
                               positive_adm_scale,
                               negative_adm_scale,
                               controlnet_softness,
                               adaptive_cfg)
        # runtime state, always initialised the same way
        self.global_diffusion_progress, self.eps_record = 0, None
torch.float32) + final_shape = [mat2.shape[1], mat2.shape[0], mat3.shape[2], mat3.shape[3]] + mat2 = torch.mm(mat2.transpose(0, 1).flatten(start_dim=1), + mat3.transpose(0, 1).flatten(start_dim=1)).reshape(final_shape).transpose(0, 1) + try: + weight += (alpha * torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1))).reshape( + weight.shape).type(weight.dtype) + except Exception as e: + print("ERROR", key, e) + elif patch_type == "fooocus": + w1 = ldm_patched.modules.model_management.cast_to_device(v[0], weight.device, torch.float32) + w_min = ldm_patched.modules.model_management.cast_to_device(v[1], weight.device, torch.float32) + w_max = ldm_patched.modules.model_management.cast_to_device(v[2], weight.device, torch.float32) + w1 = (w1 / 255.0) * (w_max - w_min) + w_min + if alpha != 0.0: + if w1.shape != weight.shape: + print("WARNING SHAPE MISMATCH {} FOOOCUS WEIGHT NOT MERGED {} != {}".format(key, w1.shape, weight.shape)) + else: + weight += alpha * ldm_patched.modules.model_management.cast_to_device(w1, weight.device, weight.dtype) + elif patch_type == "lokr": + w1 = v[0] + w2 = v[1] + w1_a = v[3] + w1_b = v[4] + w2_a = v[5] + w2_b = v[6] + t2 = v[7] + dim = None + + if w1 is None: + dim = w1_b.shape[0] + w1 = torch.mm(ldm_patched.modules.model_management.cast_to_device(w1_a, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w1_b, weight.device, torch.float32)) + else: + w1 = ldm_patched.modules.model_management.cast_to_device(w1, weight.device, torch.float32) + + if w2 is None: + dim = w2_b.shape[0] + if t2 is None: + w2 = torch.mm(ldm_patched.modules.model_management.cast_to_device(w2_a, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2_b, weight.device, torch.float32)) + else: + w2 = torch.einsum('i j k l, j r, i p -> p r k l', + ldm_patched.modules.model_management.cast_to_device(t2, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2_b, 
weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2_a, weight.device, torch.float32)) + else: + w2 = ldm_patched.modules.model_management.cast_to_device(w2, weight.device, torch.float32) + + if len(w2.shape) == 4: + w1 = w1.unsqueeze(2).unsqueeze(2) + if v[2] is not None and dim is not None: + alpha *= v[2] / dim + + try: + weight += alpha * torch.kron(w1, w2).reshape(weight.shape).type(weight.dtype) + except Exception as e: + print("ERROR", key, e) + elif patch_type == "loha": + w1a = v[0] + w1b = v[1] + if v[2] is not None: + alpha *= v[2] / w1b.shape[0] + w2a = v[3] + w2b = v[4] + if v[5] is not None: # cp decomposition + t1 = v[5] + t2 = v[6] + m1 = torch.einsum('i j k l, j r, i p -> p r k l', + ldm_patched.modules.model_management.cast_to_device(t1, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w1b, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w1a, weight.device, torch.float32)) + + m2 = torch.einsum('i j k l, j r, i p -> p r k l', + ldm_patched.modules.model_management.cast_to_device(t2, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2b, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2a, weight.device, torch.float32)) + else: + m1 = torch.mm(ldm_patched.modules.model_management.cast_to_device(w1a, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w1b, weight.device, torch.float32)) + m2 = torch.mm(ldm_patched.modules.model_management.cast_to_device(w2a, weight.device, torch.float32), + ldm_patched.modules.model_management.cast_to_device(w2b, weight.device, torch.float32)) + + try: + weight += (alpha * m1 * m2).reshape(weight.shape).type(weight.dtype) + except Exception as e: + print("ERROR", key, e) + elif patch_type == "glora": + if v[4] is not None: + alpha *= v[4] / v[0].shape[0] + + a1 = 
def compute_cfg(uncond, cond, cfg_scale, t):
    """Combine ``cond`` and ``uncond`` with the requested guidance scale.

    When ``cfg_scale`` exceeds the ``adaptive_cfg`` setting of the current
    process, the result is a blend of the guidance computed with ``cfg_scale``
    (weight ``t``) and the guidance computed with ``adaptive_cfg`` (weight
    ``1 - t``). Otherwise the plain ``cfg_scale`` guidance is returned.

    Args:
        uncond: unconditional prediction.
        cond: conditional prediction.
        cfg_scale: requested guidance scale.
        t: blend weight given to the ``cfg_scale`` result.
    """
    settings = patch_settings[os.getpid()]
    mimic_cfg = float(settings.adaptive_cfg)
    real_cfg = float(cfg_scale)

    real_eps = uncond + real_cfg * (cond - uncond)
    if not cfg_scale > settings.adaptive_cfg:
        return real_eps

    mimicked_eps = uncond + mimic_cfg * (cond - uncond)
    return real_eps * t + mimicked_eps * (1 - t)
def round_to_64(x):
    """Return ``x`` rounded to the nearest multiple of 64 as an int.

    ``x`` is converted with ``float()`` first, so numeric strings are accepted.
    Ties follow the behaviour of the built-in ``round``.
    """
    multiples = int(round(float(x) / 64.0))
    return multiples * 64
torch.flatten(h).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1) + return h + + width, height = int(width), int(height) + target_width, target_height = round_to_64(target_width), round_to_64(target_height) + + adm_emphasized = embedder([height, width, 0, 0, target_height, target_width]) + adm_consistent = embedder([target_height, target_width, 0, 0, target_height, target_width]) + + clip_pooled = clip_pooled.to(adm_emphasized) + final_adm = torch.cat((clip_pooled, adm_emphasized, clip_pooled, adm_consistent), dim=1) + + return final_adm + + +def patched_KSamplerX0Inpaint_forward(self, x, sigma, uncond, cond, cond_scale, denoise_mask, model_options={}, seed=None): + if inpaint_worker.current_task is not None: + latent_processor = self.inner_model.inner_model.process_latent_in + inpaint_latent = latent_processor(inpaint_worker.current_task.latent).to(x) + inpaint_mask = inpaint_worker.current_task.latent_mask.to(x) + + if getattr(self, 'energy_generator', None) is None: + # avoid bad results by using different seeds. 
def timed_adm(y, timesteps):
    """Select between the two halves of a 5632-wide ADM vector per timestep.

    Anything that is not a 2-D tensor with exactly 5632 columns is returned
    unchanged. Otherwise the first 2816 columns are used for rows whose
    timestep is above ``999 * (1 - adm_scaler_end)`` and the last 2816
    columns for the remaining rows, giving a tensor with 2816 columns.
    """
    is_double_adm = isinstance(y, torch.Tensor) and int(y.dim()) == 2 and int(y.shape[1]) == 5632
    if not is_double_adm:
        return y

    adm_scaler_end = float(patch_settings[os.getpid()].adm_scaler_end)
    threshold = 999.0 * (1.0 - adm_scaler_end)

    # Mask is cast to y's dtype/device and gets a trailing axis so it
    # broadcasts across the feature columns.
    y_mask = (timesteps > threshold).to(y)[..., None]

    first_half = y[..., :2816].clone()
    second_half = y[..., 2816:].clone()
    return first_half * y_mask + second_half * (1.0 - y_mask)
patch_settings[pid].controlnet_softness > 0: + for i in range(10): + k = 1.0 - float(i) / 9.0 + outs[i] = outs[i] * (1.0 - patch_settings[pid].controlnet_softness * k) + + return outs + + +def patched_unet_forward(self, x, timesteps=None, context=None, y=None, control=None, transformer_options={}, **kwargs): + self.current_step = 1.0 - timesteps.to(x) / 999.0 + patch_settings[os.getpid()].global_diffusion_progress = float(self.current_step.detach().cpu().numpy().tolist()[0]) + + y = timed_adm(y, timesteps) + + transformer_options["original_shape"] = list(x.shape) + transformer_options["transformer_index"] = 0 + transformer_patches = transformer_options.get("patches", {}) + + num_video_frames = kwargs.get("num_video_frames", self.default_num_video_frames) + image_only_indicator = kwargs.get("image_only_indicator", self.default_image_only_indicator) + time_context = kwargs.get("time_context", None) + + assert (y is not None) == ( + self.num_classes is not None + ), "must specify y if and only if the model is class-conditional" + hs = [] + t_emb = ldm_patched.ldm.modules.diffusionmodules.openaimodel.timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(x.dtype) + emb = self.time_embed(t_emb) + + if self.num_classes is not None: + assert y.shape[0] == x.shape[0] + emb = emb + self.label_emb(y) + + h = x + for id, module in enumerate(self.input_blocks): + transformer_options["block"] = ("input", id) + h = forward_timestep_embed(module, h, emb, context, transformer_options, time_context=time_context, num_video_frames=num_video_frames, image_only_indicator=image_only_indicator) + h = apply_control(h, control, 'input') + if "input_block_patch" in transformer_patches: + patch = transformer_patches["input_block_patch"] + for p in patch: + h = p(h, transformer_options) + + hs.append(h) + if "input_block_patch_after_skip" in transformer_patches: + patch = transformer_patches["input_block_patch_after_skip"] + for p in patch: + h = p(h, transformer_options) + 
def build_loaded(module, loader_name):
    """Wrap ``module.<loader_name>`` so a failed load quarantines the file.

    The first time this runs for a given loader, the original callable is
    saved on the module as ``<loader_name>_origin``; later calls reuse that
    saved callable, so the wrapper is never wrapped again.

    When the original loader raises, every string argument that names an
    existing path is renamed to ``<path>.corrupted`` (replacing any earlier
    backup) and a ``ValueError`` describing what happened is raised, chained
    to the original exception.

    Args:
        module: Object (usually a module) that owns the loader attribute.
        loader_name: Name of the attribute to wrap.

    Raises:
        ValueError: From the installed wrapper, whenever the original loader
            raises an ``Exception``.
    """
    original_loader_name = loader_name + '_origin'

    if not hasattr(module, original_loader_name):
        setattr(module, original_loader_name, getattr(module, loader_name))

    original_loader = getattr(module, original_loader_name)

    def loader(*args, **kwargs):
        try:
            return original_loader(*args, **kwargs)
        except Exception as e:
            exp = str(e) + '\n'
            for path in list(args) + list(kwargs.values()):
                if not isinstance(path, str) or not os.path.exists(path):
                    continue

                exp += f'File corrupted: {path} \n'
                corrupted_backup_file = path + '.corrupted'
                try:
                    if os.path.exists(corrupted_backup_file):
                        os.remove(corrupted_backup_file)
                    os.replace(path, corrupted_backup_file)
                    if os.path.exists(path):
                        os.remove(path)
                except OSError as move_error:
                    # A failed quarantine must not hide the load error that
                    # triggered it; record it and keep building the message.
                    exp += f'Fooocus failed to move the corrupted file: {move_error} \n'
                    continue

                exp += f'Fooocus has tried to move the corrupted file to {corrupted_backup_file} \n'
                exp += 'You may try again now and Fooocus will download models again. \n'
            # Chain explicitly so the traceback shows the real loader failure.
            raise ValueError(exp) from e

    setattr(module, loader_name, loader)
    return
def patched_encode_token_weights(self, token_weight_pairs):
    """Encode weighted token sections and apply the per-token weights.

    ``token_weight_pairs`` is a sequence of sections; each section is a
    sequence of pairs whose first item is the token and whose second item is
    its weight. Returns a tuple ``(encoded, first_pooled)`` where
    ``encoded`` is the per-section outputs concatenated along ``dim=-2`` and
    ``first_pooled`` is the first row of the pooled output (or ``None``).
    Both are moved to the intermediate device.
    """
    to_encode = list()
    max_token_len = 0
    has_weights = False
    for x in token_weight_pairs:
        tokens = list(map(lambda a: a[0], x))
        max_token_len = max(len(tokens), max_token_len)
        # Any weight other than exactly 1.0 switches on the weighting path.
        has_weights = has_weights or not all(map(lambda a: a[1] == 1.0, x))
        to_encode.append(tokens)

    sections = len(to_encode)
    if has_weights or sections == 0:
        # Extra trailing section built by gen_empty_tokens; its encoding
        # (out[-1]) is the baseline that weighted tokens are scaled against.
        to_encode.append(ldm_patched.modules.sd1_clip.gen_empty_tokens(self.special_tokens, max_token_len))

    out, pooled = self.encode(to_encode)
    if pooled is not None:
        first_pooled = pooled[0:1].to(ldm_patched.modules.model_management.intermediate_device())
    else:
        first_pooled = pooled

    output = []
    for k in range(0, sections):
        z = out[k:k + 1]
        if has_weights:
            original_mean = z.mean()
            z_empty = out[-1]
            for i in range(len(z)):
                for j in range(len(z[i])):
                    weight = token_weight_pairs[k][j][1]
                    if weight != 1.0:
                        # Scale the token's offset from the empty baseline.
                        z[i][j] = (z[i][j] - z_empty[j]) * weight + z_empty[j]
            new_mean = z.mean()
            # Rescale so the section keeps the mean it had before weighting.
            z = z * (original_mean / new_mean)
        output.append(z)

    if len(output) == 0:
        # No input sections: return the encoding of the empty section alone.
        return out[-1:].to(ldm_patched.modules.model_management.intermediate_device()), first_pooled
    return torch.cat(output, dim=-2).to(ldm_patched.modules.model_management.intermediate_device()), first_pooled
def patched_ClipVisionModel_encode_image(self, image):
    """Run the CLIP vision model on ``image`` and collect its outputs.

    Loads the model through ``self.patcher``, preprocesses the image on
    ``self.load_device`` and calls the model with hidden states enabled.
    Every non-``None`` output is moved to the intermediate device. The full
    ``hidden_states`` entry is replaced by ``None`` after its second-to-last
    element is stored under ``penultimate_hidden_states``.

    Returns:
        The model's output mapping, modified in place as described above.
    """
    ldm_patched.modules.model_management.load_model_gpu(self.patcher)
    pixel_values = ldm_patched.modules.clip_vision.clip_preprocess(image.to(self.load_device))
    outputs = self.model(pixel_values=pixel_values, output_hidden_states=True)

    intermediate_device = ldm_patched.modules.model_management.intermediate_device()

    # Iterate over a snapshot of the keys: the loop inserts
    # "penultimate_hidden_states", and adding a key to a mapping while
    # iterating it directly is only safe by accident of key order.
    for k in list(outputs):
        t = outputs[k]
        if t is None:
            continue
        if k == 'hidden_states':
            outputs["penultimate_hidden_states"] = t[-2].to(intermediate_device)
            outputs["hidden_states"] = None
        else:
            outputs[k] = t.to(intermediate_device)

    return outputs
def patched_register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                              linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """Build the sigma table from a beta schedule and register it on ``self``.

    Consistent with Kohya to reduce differences between model training and
    inference. ``given_betas`` is used when provided; otherwise the schedule
    comes from ``make_beta_schedule``. Sets ``num_timesteps``,
    ``linear_start`` and ``linear_end``, then passes float32 sigmas
    ``sqrt((1 - alphas_cumprod) / alphas_cumprod)`` to ``self.set_sigmas``.
    """
    betas = given_betas if given_betas is not None else make_beta_schedule(
        beta_schedule,
        timesteps,
        linear_start=linear_start,
        linear_end=linear_end,
        cosine_s=cosine_s)

    alphas_cumprod = np.cumprod(1. - betas, axis=0)

    # The schedule must be one-dimensional; unpacking enforces that.
    (schedule_length,) = betas.shape
    self.num_timesteps = int(schedule_length)
    self.linear_start = linear_start
    self.linear_end = linear_end

    sigma_values = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
    self.set_sigmas(torch.tensor(sigma_values, dtype=torch.float32))
    return
def get_current_html_path(output_format=None):
    """Return the path of ``log.html`` next to the current output file.

    ``output_format`` falls back to ``modules.config.default_output_format``
    when falsy. The directory is taken from the filename that
    ``generate_temp_filename`` produces for ``modules.config.path_outputs``.
    """
    extension = output_format or modules.config.default_output_format
    _, local_temp_filename, _ = generate_temp_filename(folder=modules.config.path_outputs,
                                                       extension=extension)
    return os.path.join(os.path.dirname(local_temp_filename), 'log.html')
*text* +Fooocus Log {date_string} {css_styles}{js}Fooocus Log {date_string} (private)
\nMetadata is embedded if enabled in the config or developer debug mode. You can find the information for each image in line Metadata Scheme.
\n\n" + end_part = f'\n' + + middle_part = log_cache.get(html_name, "") + + if middle_part == "": + if os.path.exists(html_name): + existing_split = open(html_name, 'r', encoding='utf-8').read().split('') + if len(existing_split) == 3: + middle_part = existing_split[1] + else: + middle_part = existing_split[0] + + div_name = only_name.replace('.', '_') + item = f"\n\n" + + middle_part = item + middle_part + + with open(html_name, 'w', encoding='utf-8') as f: + f.write(begin_part + middle_part + end_part) + + print(f'Image generated with private log at: {html_name}') + + log_cache[html_name] = middle_part + + return local_temp_filename diff --git a/modules/sample_hijack.py b/modules/sample_hijack.py new file mode 100644 index 0000000000000000000000000000000000000000..5936a096d9f0afaac0a672f72cee5f84b23496ad --- /dev/null +++ b/modules/sample_hijack.py @@ -0,0 +1,184 @@ +import torch +import ldm_patched.modules.samplers +import ldm_patched.modules.model_management + +from collections import namedtuple +from ldm_patched.contrib.external_custom_sampler import SDTurboScheduler +from ldm_patched.k_diffusion import sampling as k_diffusion_sampling +from ldm_patched.modules.samplers import normal_scheduler, simple_scheduler, ddim_scheduler +from ldm_patched.modules.model_base import SDXLRefiner, SDXL +from ldm_patched.modules.conds import CONDRegular +from ldm_patched.modules.sample import get_additional_models, get_models_from_cond, cleanup_additional_models +from ldm_patched.modules.samplers import resolve_areas_and_cond_masks, wrap_model, calculate_start_end_timesteps, \ + create_cond_with_same_area_if_none, pre_run_control, apply_empty_x_to_equal_area, encode_model_conds + + +current_refiner = None +refiner_switch_step = -1 + + +@torch.no_grad() +@torch.inference_mode() +def clip_separate_inner(c, p, target_model=None, target_clip=None): + if target_model is None or isinstance(target_model, SDXLRefiner): + c = c[..., -1280:].clone() + elif isinstance(target_model, 
@torch.no_grad()
@torch.inference_mode()
def clip_separate(cond, target_model=None, target_clip=None):
    """Apply ``clip_separate_inner`` to every ``(tensor, extras)`` pair in ``cond``.

    Returns a new list of ``[tensor, extras]`` entries. The new ``extras``
    dict holds only a cloned ``pooled_output`` when one is returned, and is
    empty otherwise.
    """
    def _separate_one(c, px):
        c, pooled = clip_separate_inner(c, px.get('pooled_output', None),
                                        target_model=target_model, target_clip=target_clip)
        if pooled is None:
            return [c, {}]
        return [c, {'pooled_output': pooled.clone()}]

    return [_separate_one(c, px) for c, px in cond]
noise.shape[3], device) + resolve_areas_and_cond_masks(negative, noise.shape[2], noise.shape[3], device) + + model_wrap = wrap_model(model) + + calculate_start_end_timesteps(model, negative) + calculate_start_end_timesteps(model, positive) + + if latent_image is not None: + latent_image = model.process_latent_in(latent_image) + + if hasattr(model, 'extra_conds'): + positive = encode_model_conds(model.extra_conds, positive, noise, device, "positive", latent_image=latent_image, denoise_mask=denoise_mask) + negative = encode_model_conds(model.extra_conds, negative, noise, device, "negative", latent_image=latent_image, denoise_mask=denoise_mask) + + #make sure each cond area has an opposite one with the same area + for c in positive: + create_cond_with_same_area_if_none(negative, c) + for c in negative: + create_cond_with_same_area_if_none(positive, c) + + # pre_run_control(model, negative + positive) + pre_run_control(model, positive) # negative is not necessary in Fooocus, 0.5s faster. + + apply_empty_x_to_equal_area(list(filter(lambda c: c.get('control_apply_to_uncond', False) == True, positive)), negative, 'control', lambda cond_cnets, x: cond_cnets[x]) + apply_empty_x_to_equal_area(positive, negative, 'gligen', lambda cond_cnets, x: cond_cnets[x]) + + extra_args = {"cond":positive, "uncond":negative, "cond_scale": cfg, "model_options": model_options, "seed":seed} + + if current_refiner is not None and hasattr(current_refiner.model, 'extra_conds'): + positive_refiner = clip_separate_after_preparation(positive, target_model=current_refiner.model) + negative_refiner = clip_separate_after_preparation(negative, target_model=current_refiner.model) + + positive_refiner = encode_model_conds(current_refiner.model.extra_conds, positive_refiner, noise, device, "positive", latent_image=latent_image, denoise_mask=denoise_mask) + negative_refiner = encode_model_conds(current_refiner.model.extra_conds, negative_refiner, noise, device, "negative", latent_image=latent_image, 
denoise_mask=denoise_mask) + + def refiner_switch(): + cleanup_additional_models(set(get_models_from_cond(positive, "control") + get_models_from_cond(negative, "control"))) + + extra_args["cond"] = positive_refiner + extra_args["uncond"] = negative_refiner + + # clear ip-adapter for refiner + extra_args['model_options'] = {k: {} if k == 'transformer_options' else v for k, v in extra_args['model_options'].items()} + + models, inference_memory = get_additional_models(positive_refiner, negative_refiner, current_refiner.model_dtype()) + ldm_patched.modules.model_management.load_models_gpu( + [current_refiner] + models, + model.memory_required([noise.shape[0] * 2] + list(noise.shape[1:])) + inference_memory) + + model_wrap.inner_model = current_refiner.model + print('Refiner Swapped') + return + + def callback_wrap(step, x0, x, total_steps): + if step == refiner_switch_step and current_refiner is not None: + refiner_switch() + if callback is not None: + # residual_noise_preview = x - x0 + # residual_noise_preview /= residual_noise_preview.std() + # residual_noise_preview *= x0.std() + callback(step, x0, x, total_steps) + + samples = sampler.sample(model_wrap, sigmas, extra_args, callback_wrap, noise, latent_image, denoise_mask, disable_pbar) + return model.process_latent_out(samples.to(torch.float32)) + + +@torch.no_grad() +@torch.inference_mode() +def calculate_sigmas_scheduler_hacked(model, scheduler_name, steps): + if scheduler_name == "karras": + sigmas = k_diffusion_sampling.get_sigmas_karras(n=steps, sigma_min=float(model.model_sampling.sigma_min), sigma_max=float(model.model_sampling.sigma_max)) + elif scheduler_name == "exponential": + sigmas = k_diffusion_sampling.get_sigmas_exponential(n=steps, sigma_min=float(model.model_sampling.sigma_min), sigma_max=float(model.model_sampling.sigma_max)) + elif scheduler_name == "normal": + sigmas = normal_scheduler(model, steps) + elif scheduler_name == "simple": + sigmas = simple_scheduler(model, steps) + elif 
def normalize_key(k):
    """Turn a raw style name into its display form.

    Hyphens become spaces, each space-separated word gets an upper-case
    first character and lower-case remainder, and a few substrings are then
    fixed up by plain replacement: ``3d`` -> ``3D``, ``Sai`` -> ``SAI``,
    ``Mre`` -> ``MRE``, ``(s`` -> ``(S``.
    """
    pieces = k.replace('-', ' ').split(' ')
    name = ' '.join(piece[:1].upper() + piece[1:].lower() for piece in pieces)

    # Order matters only in that each replacement sees the previous result.
    for needle, replacement in (('3d', '3D'), ('Sai', 'SAI'), ('Mre', 'MRE'), ('(s', '(S')):
        name = name.replace(needle, replacement)
    return name
entry else '' + negative_prompt = entry['negative_prompt'] if 'negative_prompt' in entry else '' + styles[name] = (prompt, negative_prompt) + except Exception as e: + print(str(e)) + print(f'Failed to load style file {styles_file}') + +style_keys = list(styles.keys()) +fooocus_expansion = "Fooocus V2" +legal_style_names = [fooocus_expansion] + style_keys + + +def apply_style(style, positive): + p, n = styles[style] + return p.replace('{prompt}', positive).splitlines(), n.splitlines() + + +def apply_wildcards(wildcard_text, rng, directory=wildcards_path): + for _ in range(wildcards_max_bfs_depth): + placeholders = re.findall(r'__([\w-]+)__', wildcard_text) + if len(placeholders) == 0: + return wildcard_text + + print(f'[Wildcards] processing: {wildcard_text}') + for placeholder in placeholders: + try: + words = open(os.path.join(directory, f'{placeholder}.txt'), encoding='utf-8').read().splitlines() + words = [x for x in words if x != ''] + assert len(words) > 0 + wildcard_text = wildcard_text.replace(f'__{placeholder}__', rng.choice(words), 1) + except: + print(f'[Wildcards] Warning: {placeholder}.txt missing or empty. ' + f'Using "{placeholder}" as a normal word.') + wildcard_text = wildcard_text.replace(f'__{placeholder}__', placeholder) + print(f'[Wildcards] {wildcard_text}') + + print(f'[Wildcards] BFS stack overflow. 
def get_words(arrays, totalMult, index):
    """Pick one word from each comma-separated array for a combination index.

    `index` is decoded like a mixed-radix number whose least significant
    digit belongs to the first array: every array except the last
    contributes the word at `index % len(words)` and then divides the
    index by its word count. The last array is indexed with whatever
    remains.

    Args:
        arrays: list of strings, each a comma-separated list of words.
        totalMult: product of the word counts. It is not needed for the
            computation and is kept only so existing callers keep working.
        index: integer combination index.

    Returns:
        A list with one chosen word per entry of `arrays` (an empty list
        when `arrays` is empty).

    Raises:
        IndexError: if the remaining index is out of range for the last
            array.
    """
    chosen = []
    last_position = len(arrays) - 1
    for position, arr in enumerate(arrays):
        words = arr.split(',')
        if position == last_position:
            chosen.append(words[index])
        else:
            # divmod keeps the arithmetic in exact integers; true division
            # would round through a float and pick wrong words once the
            # index exceeds 2**53.
            index, digit = divmod(index, len(words))
            chosen.append(words[digit])
    return chosen


def apply_arrays(text, index):
    """Replace every '[[a,b,c]]' group in `text` with one of its words.

    The groups are treated as the digits of a combination counter:
    `index` is wrapped into the total number of combinations and decoded
    by `get_words`, so consecutive indices walk through all combinations,
    the first group varying fastest.

    Args:
        text: prompt text that may contain '[[...]]' groups.
        index: integer selecting the combination; any value is accepted
            because it is reduced modulo the number of combinations.

    Returns:
        `text` with each group replaced by its chosen word, or `text`
        unchanged when it contains no groups.
    """
    arrays = re.findall(r'\[\[(.*?)\]\]', text)
    if len(arrays) == 0:
        return text

    print(f'[Arrays] processing: {text}')
    mult = 1
    for arr in arrays:
        mult *= len(arr.split(','))

    index %= mult
    chosen_words = get_words(arrays, mult, index)

    # Replace one occurrence per group so identical groups are consumed
    # left to right, each with its own chosen word.
    for arr, word in zip(arrays, chosen_words):
        text = text.replace(f'[[{arr}]]', word, 1)

    return text
def webpath(fn):
    """Build the 'file=<path>?<mtime>' reference string for a file.

    A path that starts with the module-level `script_path` is expressed
    relative to it, with backslashes turned into forward slashes; any
    other path is made absolute. The file's modification time is appended
    after '?' (presumably so browsers refetch changed files -- confirm).

    Raises OSError (from os.path.getmtime) if `fn` cannot be stat'ed.
    """
    if not fn.startswith(script_path):
        web_path = os.path.abspath(fn)
    else:
        web_path = os.path.relpath(fn, script_path).replace('\\', '/')

    mtime = os.path.getmtime(fn)
    return f'file={web_path}?{mtime}'
def css_html():
    """Return the HTML head fragment for the UI stylesheet."""
    # webpath() stats the file, so this call fails if css/style.css is
    # missing even though the result is not used below.
    style_css_path = webpath('css/style.css')
    # NOTE(review): the f-string is empty in this copy, so the function
    # returns '' and style_css_path is unused. Presumably the markup
    # referencing the stylesheet was lost -- confirm against the upstream
    # file before relying on this.
    head = f''
    return head