diff --git a/.gitattributes b/.gitattributes index 9072bac14015f13355e68eea3f0ad0b010303f10..d1ef78d7aac72a83b1162a11a092e88ba4aa346a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -37,3 +37,12 @@ extensions/Stable-Diffusion-Webui-Civitai-Helper/img/all_in_one.png filter=lfs d extensions/addtional/models/lora/README.md filter=lfs diff=lfs merge=lfs -text repositories/BLIP/BLIP.gif filter=lfs diff=lfs merge=lfs -text repositories/generative-models/assets/sdxl_report.pdf filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-inpainting/merged-leopards.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/d2i.gif filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img01.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img02.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0000.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0004.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png filter=lfs diff=lfs merge=lfs -text +repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png filter=lfs diff=lfs merge=lfs -text diff --git a/repositories/k-diffusion/k_diffusion/__pycache__/utils.cpython-310.pyc b/repositories/k-diffusion/k_diffusion/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7e8b5d6e6fb008aa5b94ae7d056e8410177fa31a Binary files /dev/null and b/repositories/k-diffusion/k_diffusion/__pycache__/utils.cpython-310.pyc differ diff --git a/repositories/k-diffusion/k_diffusion/augmentation.py b/repositories/k-diffusion/k_diffusion/augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..7dd17c686300c8ecba7fac134aa54f01619c3d46 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/augmentation.py @@ -0,0 +1,105 @@ +from functools import reduce +import math +import operator + +import numpy as np +from skimage import transform +import torch +from torch import nn + + +def translate2d(tx, ty): + mat = [[1, 0, tx], + [0, 1, ty], + [0, 0, 1]] + return torch.tensor(mat, dtype=torch.float32) + + +def scale2d(sx, sy): + mat = [[sx, 0, 0], + [ 0, sy, 0], + [ 0, 0, 1]] + return torch.tensor(mat, dtype=torch.float32) + + +def rotate2d(theta): + mat = [[torch.cos(theta), torch.sin(-theta), 0], + [torch.sin(theta), torch.cos(theta), 0], + [ 0, 0, 1]] + return torch.tensor(mat, dtype=torch.float32) + + +class KarrasAugmentationPipeline: + def __init__(self, a_prob=0.12, a_scale=2**0.2, a_aniso=2**0.2, a_trans=1/8): + self.a_prob = a_prob + self.a_scale = a_scale + self.a_aniso = a_aniso + self.a_trans = a_trans + + def __call__(self, image): + h, w = image.size + mats = [translate2d(h / 2 - 0.5, w / 2 - 0.5)] + + # x-flip + a0 = torch.randint(2, []).float() + mats.append(scale2d(1 - 2 * a0, 1)) + # y-flip + do = (torch.rand([]) < self.a_prob).float() + a1 = torch.randint(2, []).float() * do + mats.append(scale2d(1, 1 - 2 * a1)) + # scaling + do = (torch.rand([]) < self.a_prob).float() + a2 = torch.randn([]) * 
do + mats.append(scale2d(self.a_scale ** a2, self.a_scale ** a2)) + # rotation + do = (torch.rand([]) < self.a_prob).float() + a3 = (torch.rand([]) * 2 * math.pi - math.pi) * do + mats.append(rotate2d(-a3)) + # anisotropy + do = (torch.rand([]) < self.a_prob).float() + a4 = (torch.rand([]) * 2 * math.pi - math.pi) * do + a5 = torch.randn([]) * do + mats.append(rotate2d(a4)) + mats.append(scale2d(self.a_aniso ** a5, self.a_aniso ** -a5)) + mats.append(rotate2d(-a4)) + # translation + do = (torch.rand([]) < self.a_prob).float() + a6 = torch.randn([]) * do + a7 = torch.randn([]) * do + mats.append(translate2d(self.a_trans * w * a6, self.a_trans * h * a7)) + + # form the transformation matrix and conditioning vector + mats.append(translate2d(-h / 2 + 0.5, -w / 2 + 0.5)) + mat = reduce(operator.matmul, mats) + cond = torch.stack([a0, a1, a2, a3.cos() - 1, a3.sin(), a5 * a4.cos(), a5 * a4.sin(), a6, a7]) + + # apply the transformation + image_orig = np.array(image, dtype=np.float32) / 255 + if image_orig.ndim == 2: + image_orig = image_orig[..., None] + tf = transform.AffineTransform(mat.numpy()) + image = transform.warp(image_orig, tf.inverse, order=3, mode='reflect', cval=0.5, clip=False, preserve_range=True) + image_orig = torch.as_tensor(image_orig).movedim(2, 0) * 2 - 1 + image = torch.as_tensor(image).movedim(2, 0) * 2 - 1 + return image, image_orig, cond + + +class KarrasAugmentWrapper(nn.Module): + def __init__(self, model): + super().__init__() + self.inner_model = model + + def forward(self, input, sigma, aug_cond=None, mapping_cond=None, **kwargs): + if aug_cond is None: + aug_cond = input.new_zeros([input.shape[0], 9]) + if mapping_cond is None: + mapping_cond = aug_cond + else: + mapping_cond = torch.cat([aug_cond, mapping_cond], dim=1) + return self.inner_model(input, sigma, mapping_cond=mapping_cond, **kwargs) + + def set_skip_stages(self, skip_stages): + return self.inner_model.set_skip_stages(skip_stages) + + def set_patch_size(self, patch_size): + return self.inner_model.set_patch_size(patch_size) diff --git a/repositories/k-diffusion/k_diffusion/config.py b/repositories/k-diffusion/k_diffusion/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f9de7bc203216b0a4e26a6d18c913fedc84dbe46 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/config.py @@ -0,0 +1,115 @@ +from functools import partial +import json +import math +import warnings + +from jsonmerge import merge + +from . 
import augmentation, layers, models, utils + + +def load_config(file): + defaults = { + 'model': { + 'sigma_data': 1., + 'patch_size': 1, + 'dropout_rate': 0., + 'augment_wrapper': True, + 'augment_prob': 0., + 'mapping_cond_dim': 0, + 'unet_cond_dim': 0, + 'cross_cond_dim': 0, + 'cross_attn_depths': None, + 'skip_stages': 0, + 'has_variance': False, + 'loss_config': 'karras', + }, + 'dataset': { + 'type': 'imagefolder', + }, + 'optimizer': { + 'type': 'adamw', + 'lr': 1e-4, + 'betas': [0.95, 0.999], + 'eps': 1e-6, + 'weight_decay': 1e-3, + }, + 'lr_sched': { + 'type': 'constant', + }, + 'ema_sched': { + 'type': 'inverse', + 'power': 0.6667, + 'max_value': 0.9999 + }, + } + config = json.load(file) + return merge(defaults, config) + + +def make_model(config): + config = config['model'] + assert config['type'] == 'image_v1' + model = models.ImageDenoiserModelV1( + config['input_channels'], + config['mapping_out'], + config['depths'], + config['channels'], + config['self_attn_depths'], + config['cross_attn_depths'], + patch_size=config['patch_size'], + dropout_rate=config['dropout_rate'], + mapping_cond_dim=config['mapping_cond_dim'] + (9 if config['augment_wrapper'] else 0), + unet_cond_dim=config['unet_cond_dim'], + cross_cond_dim=config['cross_cond_dim'], + skip_stages=config['skip_stages'], + has_variance=config['has_variance'], + ) + if config['augment_wrapper']: + model = augmentation.KarrasAugmentWrapper(model) + return model + + +def make_denoiser_wrapper(config): + config = config['model'] + sigma_data = config.get('sigma_data', 1.) + has_variance = config.get('has_variance', False) + loss_config = config.get('loss_config', 'karras') + if loss_config == 'karras': + if not has_variance: + return partial(layers.Denoiser, sigma_data=sigma_data) + return partial(layers.DenoiserWithVariance, sigma_data=sigma_data) + if loss_config == 'simple': + if has_variance: + raise ValueError('Simple loss config does not support a variance output') + return partial(layers.SimpleLossDenoiser, sigma_data=sigma_data) + raise ValueError('Unknown loss config type') + + +def make_sample_density(config): + sd_config = config['sigma_sample_density'] + sigma_data = config['sigma_data'] + if sd_config['type'] == 'lognormal': + loc = sd_config['mean'] if 'mean' in sd_config else sd_config['loc'] + scale = sd_config['std'] if 'std' in sd_config else sd_config['scale'] + return partial(utils.rand_log_normal, loc=loc, scale=scale) + if sd_config['type'] == 'loglogistic': + loc = sd_config['loc'] if 'loc' in sd_config else math.log(sigma_data) + scale = sd_config['scale'] if 'scale' in sd_config else 0.5 + min_value = sd_config['min_value'] if 'min_value' in sd_config else 0. 
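# --- Illustrative usage sketch (not part of the diff): how load_config, make_model and
# make_denoiser_wrapper above are typically combined. Only the 'model' keys that
# make_model() reads and that have no default are supplied; all values below are
# placeholder assumptions.
import io
import json

user_config = {
    'model': {
        'type': 'image_v1',
        'input_channels': 3,
        'mapping_out': 256,
        'depths': [2, 2, 4],
        'channels': [128, 256, 512],
        'self_attn_depths': [False, False, True],
    }
}
config = load_config(io.StringIO(json.dumps(user_config)))
inner_model = make_model(config)                      # ImageDenoiserModelV1, augment-wrapped by default
model = make_denoiser_wrapper(config)(inner_model)    # Karras-preconditioned Denoiser from layers.py
# --- end sketch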
+ max_value = sd_config['max_value'] if 'max_value' in sd_config else float('inf') + return partial(utils.rand_log_logistic, loc=loc, scale=scale, min_value=min_value, max_value=max_value) + if sd_config['type'] == 'loguniform': + min_value = sd_config['min_value'] if 'min_value' in sd_config else config['sigma_min'] + max_value = sd_config['max_value'] if 'max_value' in sd_config else config['sigma_max'] + return partial(utils.rand_log_uniform, min_value=min_value, max_value=max_value) + if sd_config['type'] in {'v-diffusion', 'cosine'}: + min_value = sd_config['min_value'] if 'min_value' in sd_config else 1e-3 + max_value = sd_config['max_value'] if 'max_value' in sd_config else 1e3 + return partial(utils.rand_v_diffusion, sigma_data=sigma_data, min_value=min_value, max_value=max_value) + if sd_config['type'] == 'split-lognormal': + loc = sd_config['mean'] if 'mean' in sd_config else sd_config['loc'] + scale_1 = sd_config['std_1'] if 'std_1' in sd_config else sd_config['scale_1'] + scale_2 = sd_config['std_2'] if 'std_2' in sd_config else sd_config['scale_2'] + return partial(utils.rand_split_log_normal, loc=loc, scale_1=scale_1, scale_2=scale_2) + raise ValueError('Unknown sample density type') diff --git a/repositories/k-diffusion/k_diffusion/evaluation.py b/repositories/k-diffusion/k_diffusion/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..2c34bbf1656854d9cf233b7620b684e44b30de82 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/evaluation.py @@ -0,0 +1,134 @@ +import math +import os +from pathlib import Path + +from cleanfid.inception_torchscript import InceptionV3W +import clip +from resize_right import resize +import torch +from torch import nn +from torch.nn import functional as F +from torchvision import transforms +from tqdm.auto import trange + +from . 
import utils + + +class InceptionV3FeatureExtractor(nn.Module): + def __init__(self, device='cpu'): + super().__init__() + path = Path(os.environ.get('XDG_CACHE_HOME', Path.home() / '.cache')) / 'k-diffusion' + url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/inception-2015-12-05.pt' + digest = 'f58cb9b6ec323ed63459aa4fb441fe750cfe39fafad6da5cb504a16f19e958f4' + utils.download_file(path / 'inception-2015-12-05.pt', url, digest) + self.model = InceptionV3W(str(path), resize_inside=False).to(device) + self.size = (299, 299) + + def forward(self, x): + if x.shape[2:4] != self.size: + x = resize(x, out_shape=self.size, pad_mode='reflect') + if x.shape[1] == 1: + x = torch.cat([x] * 3, dim=1) + x = (x * 127.5 + 127.5).clamp(0, 255) + return self.model(x) + + +class CLIPFeatureExtractor(nn.Module): + def __init__(self, name='ViT-L/14@336px', device='cpu'): + super().__init__() + self.model = clip.load(name, device=device)[0].eval().requires_grad_(False) + self.normalize = transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)) + self.size = (self.model.visual.input_resolution, self.model.visual.input_resolution) + + def forward(self, x): + if x.shape[2:4] != self.size: + x = resize(x.add(1).div(2), out_shape=self.size, pad_mode='reflect').clamp(0, 1) + x = self.normalize(x) + x = self.model.encode_image(x).float() + x = F.normalize(x) * x.shape[1] ** 0.5 + return x + + +def compute_features(accelerator, sample_fn, extractor_fn, n, batch_size): + n_per_proc = math.ceil(n / accelerator.num_processes) + feats_all = [] + try: + for i in trange(0, n_per_proc, batch_size, disable=not accelerator.is_main_process): + cur_batch_size = min(n - i, batch_size) + samples = sample_fn(cur_batch_size)[:cur_batch_size] + feats_all.append(accelerator.gather(extractor_fn(samples))) + except StopIteration: + pass + return torch.cat(feats_all)[:n] + + +def polynomial_kernel(x, y): + d = x.shape[-1] + dot = x @ y.transpose(-2, -1) + return (dot / d + 1) ** 3 + + +def squared_mmd(x, y, kernel=polynomial_kernel): + m = x.shape[-2] + n = y.shape[-2] + kxx = kernel(x, x) + kyy = kernel(y, y) + kxy = kernel(x, y) + kxx_sum = kxx.sum([-1, -2]) - kxx.diagonal(dim1=-1, dim2=-2).sum(-1) + kyy_sum = kyy.sum([-1, -2]) - kyy.diagonal(dim1=-1, dim2=-2).sum(-1) + kxy_sum = kxy.sum([-1, -2]) + term_1 = kxx_sum / m / (m - 1) + term_2 = kyy_sum / n / (n - 1) + term_3 = kxy_sum * 2 / m / n + return term_1 + term_2 - term_3 + + +@utils.tf32_mode(matmul=False) +def kid(x, y, max_size=5000): + x_size, y_size = x.shape[0], y.shape[0] + n_partitions = math.ceil(max(x_size / max_size, y_size / max_size)) + total_mmd = x.new_zeros([]) + for i in range(n_partitions): + cur_x = x[round(i * x_size / n_partitions):round((i + 1) * x_size / n_partitions)] + cur_y = y[round(i * y_size / n_partitions):round((i + 1) * y_size / n_partitions)] + total_mmd = total_mmd + squared_mmd(cur_x, cur_y) + return total_mmd / n_partitions + + +class _MatrixSquareRootEig(torch.autograd.Function): + @staticmethod + def forward(ctx, a): + vals, vecs = torch.linalg.eigh(a) + ctx.save_for_backward(vals, vecs) + return vecs @ vals.abs().sqrt().diag_embed() @ vecs.transpose(-2, -1) + + @staticmethod + def backward(ctx, grad_output): + vals, vecs = ctx.saved_tensors + d = vals.abs().sqrt().unsqueeze(-1).repeat_interleave(vals.shape[-1], -1) + vecs_t = vecs.transpose(-2, -1) + return vecs @ (vecs_t @ grad_output @ vecs / (d + d.transpose(-2, -1))) @ vecs_t + + +def sqrtm_eig(a): + if a.ndim 
< 2: + raise RuntimeError('tensor of matrices must have at least 2 dimensions') + if a.shape[-2] != a.shape[-1]: + raise RuntimeError('tensor must be batches of square matrices') + return _MatrixSquareRootEig.apply(a) + + +@utils.tf32_mode(matmul=False) +def fid(x, y, eps=1e-8): + x_mean = x.mean(dim=0) + y_mean = y.mean(dim=0) + mean_term = (x_mean - y_mean).pow(2).sum() + x_cov = torch.cov(x.T) + y_cov = torch.cov(y.T) + eps_eye = torch.eye(x_cov.shape[0], device=x_cov.device, dtype=x_cov.dtype) * eps + x_cov = x_cov + eps_eye + y_cov = y_cov + eps_eye + x_cov_sqrt = sqrtm_eig(x_cov) + cov_term = torch.trace(x_cov + y_cov - 2 * sqrtm_eig(x_cov_sqrt @ y_cov @ x_cov_sqrt)) + return mean_term + cov_term diff --git a/repositories/k-diffusion/k_diffusion/external.py b/repositories/k-diffusion/k_diffusion/external.py new file mode 100644 index 0000000000000000000000000000000000000000..79b51cec41c52f054775f26c26cf63414d588aef --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/external.py @@ -0,0 +1,177 @@ +import math + +import torch +from torch import nn + +from . import sampling, utils + + +class VDenoiser(nn.Module): + """A v-diffusion-pytorch model wrapper for k-diffusion.""" + + def __init__(self, inner_model): + super().__init__() + self.inner_model = inner_model + self.sigma_data = 1. + + def get_scalings(self, sigma): + c_skip = self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + c_out = -sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + return c_skip, c_out, c_in + + def sigma_to_t(self, sigma): + return sigma.atan() / math.pi * 2 + + def t_to_sigma(self, t): + return (t * math.pi / 2).tan() + + def loss(self, input, noise, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + model_output = self.inner_model(noised_input * c_in, self.sigma_to_t(sigma), **kwargs) + target = (input - c_skip * noised_input) / c_out + return (model_output - target).pow(2).flatten(1).mean(1) + + def forward(self, input, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + return self.inner_model(input * c_in, self.sigma_to_t(sigma), **kwargs) * c_out + input * c_skip + + +class DiscreteSchedule(nn.Module): + """A mapping between continuous noise levels (sigmas) and a list of discrete noise + levels.""" + + def __init__(self, sigmas, quantize): + super().__init__() + self.register_buffer('sigmas', sigmas) + self.register_buffer('log_sigmas', sigmas.log()) + self.quantize = quantize + + @property + def sigma_min(self): + return self.sigmas[0] + + @property + def sigma_max(self): + return self.sigmas[-1] + + def get_sigmas(self, n=None): + if n is None: + return sampling.append_zero(self.sigmas.flip(0)) + t_max = len(self.sigmas) - 1 + t = torch.linspace(t_max, 0, n, device=self.sigmas.device) + return sampling.append_zero(self.t_to_sigma(t)) + + def sigma_to_t(self, sigma, quantize=None): + quantize = self.quantize if quantize is None else quantize + log_sigma = sigma.log() + dists = log_sigma - self.log_sigmas[:, None] + if quantize: + return dists.abs().argmin(dim=0).view(sigma.shape) + low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + low, high = self.log_sigmas[low_idx], self.log_sigmas[high_idx] + w = (low - log_sigma) / (low - high) + w = w.clamp(0, 1) + t = (1 
- w) * low_idx + w * high_idx + return t.view(sigma.shape) + + def t_to_sigma(self, t): + t = t.float() + low_idx, high_idx, w = t.floor().long(), t.ceil().long(), t.frac() + log_sigma = (1 - w) * self.log_sigmas[low_idx] + w * self.log_sigmas[high_idx] + return log_sigma.exp() + + +class DiscreteEpsDDPMDenoiser(DiscreteSchedule): + """A wrapper for discrete schedule DDPM models that output eps (the predicted + noise).""" + + def __init__(self, model, alphas_cumprod, quantize): + super().__init__(((1 - alphas_cumprod) / alphas_cumprod) ** 0.5, quantize) + self.inner_model = model + self.sigma_data = 1. + + def get_scalings(self, sigma): + c_out = -sigma + c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + return c_out, c_in + + def get_eps(self, *args, **kwargs): + return self.inner_model(*args, **kwargs) + + def loss(self, input, noise, sigma, **kwargs): + c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + eps = self.get_eps(noised_input * c_in, self.sigma_to_t(sigma), **kwargs) + return (eps - noise).pow(2).flatten(1).mean(1) + + def forward(self, input, sigma, **kwargs): + c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + eps = self.get_eps(input * c_in, self.sigma_to_t(sigma), **kwargs) + return input + eps * c_out + + +class OpenAIDenoiser(DiscreteEpsDDPMDenoiser): + """A wrapper for OpenAI diffusion models.""" + + def __init__(self, model, diffusion, quantize=False, has_learned_sigmas=True, device='cpu'): + alphas_cumprod = torch.tensor(diffusion.alphas_cumprod, device=device, dtype=torch.float32) + super().__init__(model, alphas_cumprod, quantize=quantize) + self.has_learned_sigmas = has_learned_sigmas + + def get_eps(self, *args, **kwargs): + model_output = self.inner_model(*args, **kwargs) + if self.has_learned_sigmas: + return model_output.chunk(2, dim=1)[0] + return model_output + + +class CompVisDenoiser(DiscreteEpsDDPMDenoiser): + """A wrapper for CompVis diffusion models.""" + + def __init__(self, model, quantize=False, device='cpu'): + super().__init__(model, model.alphas_cumprod, quantize=quantize) + + def get_eps(self, *args, **kwargs): + return self.inner_model.apply_model(*args, **kwargs) + + +class DiscreteVDDPMDenoiser(DiscreteSchedule): + """A wrapper for discrete schedule DDPM models that output v.""" + + def __init__(self, model, alphas_cumprod, quantize): + super().__init__(((1 - alphas_cumprod) / alphas_cumprod) ** 0.5, quantize) + self.inner_model = model + self.sigma_data = 1. 
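# --- Illustrative usage sketch (not part of the diff): wrapping an already-loaded
# CompVis/Stable Diffusion latent model with CompVisDenoiser above and sampling with
# sample_euler from sampling.py later in this diff. `ldm_model` and `cond` are
# hypothetical (a LatentDiffusion instance and its conditioning); shapes and the step
# count are placeholder values.
import torch
from k_diffusion import sampling

denoiser = CompVisDenoiser(ldm_model)                 # eps-prediction wrapper around apply_model
sigmas = denoiser.get_sigmas(20)                      # 21 descending sigmas with a final 0 appended
x = torch.randn([1, 4, 64, 64]) * sigmas[0]           # start from noise at sigma_max
sample = sampling.sample_euler(denoiser, x, sigmas, extra_args={'cond': cond})
# --- end sketch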
+ + def get_scalings(self, sigma): + c_skip = self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + c_out = -sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + return c_skip, c_out, c_in + + def get_v(self, *args, **kwargs): + return self.inner_model(*args, **kwargs) + + def loss(self, input, noise, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + model_output = self.get_v(noised_input * c_in, self.sigma_to_t(sigma), **kwargs) + target = (input - c_skip * noised_input) / c_out + return (model_output - target).pow(2).flatten(1).mean(1) + + def forward(self, input, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + return self.get_v(input * c_in, self.sigma_to_t(sigma), **kwargs) * c_out + input * c_skip + + +class CompVisVDenoiser(DiscreteVDDPMDenoiser): + """A wrapper for CompVis diffusion models that output v.""" + + def __init__(self, model, quantize=False, device='cpu'): + super().__init__(model, model.alphas_cumprod, quantize=quantize) + + def get_v(self, x, t, cond, **kwargs): + return self.inner_model.apply_model(x, t, cond) diff --git a/repositories/k-diffusion/k_diffusion/gns.py b/repositories/k-diffusion/k_diffusion/gns.py new file mode 100644 index 0000000000000000000000000000000000000000..dcb7b8d8a9aeae38a7f961c63f66cca4ef90a9e7 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/gns.py @@ -0,0 +1,99 @@ +import torch +from torch import nn + + +class DDPGradientStatsHook: + def __init__(self, ddp_module): + try: + ddp_module.register_comm_hook(self, self._hook_fn) + except AttributeError: + raise ValueError('DDPGradientStatsHook does not support non-DDP wrapped modules') + self._clear_state() + + def _clear_state(self): + self.bucket_sq_norms_small_batch = [] + self.bucket_sq_norms_large_batch = [] + + @staticmethod + def _hook_fn(self, bucket): + buf = bucket.buffer() + self.bucket_sq_norms_small_batch.append(buf.pow(2).sum()) + fut = torch.distributed.all_reduce(buf, op=torch.distributed.ReduceOp.AVG, async_op=True).get_future() + def callback(fut): + buf = fut.value()[0] + self.bucket_sq_norms_large_batch.append(buf.pow(2).sum()) + return buf + return fut.then(callback) + + def get_stats(self): + sq_norm_small_batch = sum(self.bucket_sq_norms_small_batch) + sq_norm_large_batch = sum(self.bucket_sq_norms_large_batch) + self._clear_state() + stats = torch.stack([sq_norm_small_batch, sq_norm_large_batch]) + torch.distributed.all_reduce(stats, op=torch.distributed.ReduceOp.AVG) + return stats[0].item(), stats[1].item() + + +class GradientNoiseScale: + """Calculates the gradient noise scale (1 / SNR), or critical batch size, + from _An Empirical Model of Large-Batch Training_, + https://arxiv.org/abs/1812.06162). + + Args: + beta (float): The decay factor for the exponential moving averages used to + calculate the gradient noise scale. + Default: 0.9998 + eps (float): Added for numerical stability. + Default: 1e-8 + """ + + def __init__(self, beta=0.9998, eps=1e-8): + self.beta = beta + self.eps = eps + self.ema_sq_norm = 0. + self.ema_var = 0. + self.beta_cumprod = 1. 
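# --- Illustrative usage sketch (not part of the diff): the intended pairing of
# DDPGradientStatsHook and GradientNoiseScale in a DDP training loop. `ddp_model`,
# `loader`, `opt`, `batch_size` and `world_size` are hypothetical; the per-process
# batch serves as the "small" batch and the global batch as the "large" one.
hook = DDPGradientStatsHook(ddp_model)
gns = GradientNoiseScale()
for batch in loader:
    loss = ddp_model(batch).mean()
    loss.backward()
    sq_norm_small, sq_norm_large = hook.get_stats()   # gathered by the comm hook during backward
    gns.update(sq_norm_small, sq_norm_large, batch_size, batch_size * world_size)
    opt.step()
    opt.zero_grad()
print('gradient noise scale:', gns.get_gns())
# --- end sketch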
+ self.gradient_noise_scale = float('nan') + + def state_dict(self): + """Returns the state of the object as a :class:`dict`.""" + return dict(self.__dict__.items()) + + def load_state_dict(self, state_dict): + """Loads the object's state. + Args: + state_dict (dict): object state. Should be an object returned + from a call to :meth:`state_dict`. + """ + self.__dict__.update(state_dict) + + def update(self, sq_norm_small_batch, sq_norm_large_batch, n_small_batch, n_large_batch): + """Updates the state with a new batch's gradient statistics, and returns the + current gradient noise scale. + + Args: + sq_norm_small_batch (float): The mean of the squared 2-norms of microbatch or + per sample gradients. + sq_norm_large_batch (float): The squared 2-norm of the mean of the microbatch or + per sample gradients. + n_small_batch (int): The batch size of the individual microbatch or per sample + gradients (1 if per sample). + n_large_batch (int): The total batch size of the mean of the microbatch or + per sample gradients. + """ + est_sq_norm = (n_large_batch * sq_norm_large_batch - n_small_batch * sq_norm_small_batch) / (n_large_batch - n_small_batch) + est_var = (sq_norm_small_batch - sq_norm_large_batch) / (1 / n_small_batch - 1 / n_large_batch) + self.ema_sq_norm = self.beta * self.ema_sq_norm + (1 - self.beta) * est_sq_norm + self.ema_var = self.beta * self.ema_var + (1 - self.beta) * est_var + self.beta_cumprod *= self.beta + self.gradient_noise_scale = max(self.ema_var, self.eps) / max(self.ema_sq_norm, self.eps) + return self.gradient_noise_scale + + def get_gns(self): + """Returns the current gradient noise scale.""" + return self.gradient_noise_scale + + def get_stats(self): + """Returns the current (debiased) estimates of the squared mean gradient + and gradient variance.""" + return self.ema_sq_norm / (1 - self.beta_cumprod), self.ema_var / (1 - self.beta_cumprod) diff --git a/repositories/k-diffusion/k_diffusion/layers.py b/repositories/k-diffusion/k_diffusion/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..aa647bd3c1e0bef91e475f2376b4a79f6bb0823d --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/layers.py @@ -0,0 +1,256 @@ +import math + +from einops import rearrange, repeat +import torch +from torch import nn +from torch.nn import functional as F + +from . import sampling, utils + +# Karras et al. preconditioned denoiser + +class Denoiser(nn.Module): + """A Karras et al. 
preconditioner for denoising diffusion models.""" + + def __init__(self, inner_model, sigma_data=1.): + super().__init__() + self.inner_model = inner_model + self.sigma_data = sigma_data + + def get_scalings(self, sigma): + c_skip = self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + c_out = sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + c_in = 1 / (sigma ** 2 + self.sigma_data ** 2) ** 0.5 + return c_skip, c_out, c_in + + def loss(self, input, noise, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + model_output = self.inner_model(noised_input * c_in, sigma, **kwargs) + target = (input - c_skip * noised_input) / c_out + return (model_output - target).pow(2).flatten(1).mean(1) + + def forward(self, input, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + return self.inner_model(input * c_in, sigma, **kwargs) * c_out + input * c_skip + + +class DenoiserWithVariance(Denoiser): + def loss(self, input, noise, sigma, **kwargs): + c_skip, c_out, c_in = [utils.append_dims(x, input.ndim) for x in self.get_scalings(sigma)] + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + model_output, logvar = self.inner_model(noised_input * c_in, sigma, return_variance=True, **kwargs) + logvar = utils.append_dims(logvar, model_output.ndim) + target = (input - c_skip * noised_input) / c_out + losses = ((model_output - target) ** 2 / logvar.exp() + logvar) / 2 + return losses.flatten(1).mean(1) + + +class SimpleLossDenoiser(Denoiser): + """L_simple with the Karras et al. preconditioner.""" + + def loss(self, input, noise, sigma, **kwargs): + noised_input = input + noise * utils.append_dims(sigma, input.ndim) + denoised = self(noised_input, sigma, **kwargs) + eps = sampling.to_d(noised_input, sigma, denoised) + return (eps - noise).pow(2).flatten(1).mean(1) + + +# Residual blocks + +class ResidualBlock(nn.Module): + def __init__(self, *main, skip=None): + super().__init__() + self.main = nn.Sequential(*main) + self.skip = skip if skip else nn.Identity() + + def forward(self, input): + return self.main(input) + self.skip(input) + + +# Noise level (and other) conditioning + +class ConditionedModule(nn.Module): + pass + + +class UnconditionedModule(ConditionedModule): + def __init__(self, module): + super().__init__() + self.module = module + + def forward(self, input, cond=None): + return self.module(input) + + +class ConditionedSequential(nn.Sequential, ConditionedModule): + def forward(self, input, cond): + for module in self: + if isinstance(module, ConditionedModule): + input = module(input, cond) + else: + input = module(input) + return input + + +class ConditionedResidualBlock(ConditionedModule): + def __init__(self, *main, skip=None): + super().__init__() + self.main = ConditionedSequential(*main) + self.skip = skip if skip else nn.Identity() + + def forward(self, input, cond): + skip = self.skip(input, cond) if isinstance(self.skip, ConditionedModule) else self.skip(input) + return self.main(input, cond) + skip + + +class AdaGN(ConditionedModule): + def __init__(self, feats_in, c_out, num_groups, eps=1e-5, cond_key='cond'): + super().__init__() + self.num_groups = num_groups + self.eps = eps + self.cond_key = cond_key + self.mapper = nn.Linear(feats_in, c_out * 2) + + def forward(self, input, cond): + weight, bias = self.mapper(cond[self.cond_key]).chunk(2, 
dim=-1) + input = F.group_norm(input, self.num_groups, eps=self.eps) + return torch.addcmul(utils.append_dims(bias, input.ndim), input, utils.append_dims(weight, input.ndim) + 1) + + +# Attention + +class SelfAttention2d(ConditionedModule): + def __init__(self, c_in, n_head, norm, dropout_rate=0.): + super().__init__() + assert c_in % n_head == 0 + self.norm_in = norm(c_in) + self.n_head = n_head + self.qkv_proj = nn.Conv2d(c_in, c_in * 3, 1) + self.out_proj = nn.Conv2d(c_in, c_in, 1) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, input, cond): + n, c, h, w = input.shape + qkv = self.qkv_proj(self.norm_in(input, cond)) + qkv = qkv.view([n, self.n_head * 3, c // self.n_head, h * w]).transpose(2, 3) + q, k, v = qkv.chunk(3, dim=1) + scale = k.shape[3] ** -0.25 + att = ((q * scale) @ (k.transpose(2, 3) * scale)).softmax(3) + att = self.dropout(att) + y = (att @ v).transpose(2, 3).contiguous().view([n, c, h, w]) + return input + self.out_proj(y) + + +class CrossAttention2d(ConditionedModule): + def __init__(self, c_dec, c_enc, n_head, norm_dec, dropout_rate=0., + cond_key='cross', cond_key_padding='cross_padding'): + super().__init__() + assert c_dec % n_head == 0 + self.cond_key = cond_key + self.cond_key_padding = cond_key_padding + self.norm_enc = nn.LayerNorm(c_enc) + self.norm_dec = norm_dec(c_dec) + self.n_head = n_head + self.q_proj = nn.Conv2d(c_dec, c_dec, 1) + self.kv_proj = nn.Linear(c_enc, c_dec * 2) + self.out_proj = nn.Conv2d(c_dec, c_dec, 1) + self.dropout = nn.Dropout(dropout_rate) + + def forward(self, input, cond): + n, c, h, w = input.shape + q = self.q_proj(self.norm_dec(input, cond)) + q = q.view([n, self.n_head, c // self.n_head, h * w]).transpose(2, 3) + kv = self.kv_proj(self.norm_enc(cond[self.cond_key])) + kv = kv.view([n, -1, self.n_head * 2, c // self.n_head]).transpose(1, 2) + k, v = kv.chunk(2, dim=1) + scale = k.shape[3] ** -0.25 + att = ((q * scale) @ (k.transpose(2, 3) * scale)) + att = att - (cond[self.cond_key_padding][:, None, None, :]) * 10000 + att = att.softmax(3) + att = self.dropout(att) + y = (att @ v).transpose(2, 3) + y = y.contiguous().view([n, c, h, w]) + return input + self.out_proj(y) + + +# Downsampling/upsampling + +_kernels = { + 'linear': + [1 / 8, 3 / 8, 3 / 8, 1 / 8], + 'cubic': + [-0.01171875, -0.03515625, 0.11328125, 0.43359375, + 0.43359375, 0.11328125, -0.03515625, -0.01171875], + 'lanczos3': + [0.003689131001010537, 0.015056144446134567, -0.03399861603975296, + -0.066637322306633, 0.13550527393817902, 0.44638532400131226, + 0.44638532400131226, 0.13550527393817902, -0.066637322306633, + -0.03399861603975296, 0.015056144446134567, 0.003689131001010537] +} +_kernels['bilinear'] = _kernels['linear'] +_kernels['bicubic'] = _kernels['cubic'] + + +class Downsample2d(nn.Module): + def __init__(self, kernel='linear', pad_mode='reflect'): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor([_kernels[kernel]]) + self.pad = kernel_1d.shape[1] // 2 - 1 + self.register_buffer('kernel', kernel_1d.T @ kernel_1d) + + def forward(self, x): + x = F.pad(x, (self.pad,) * 4, self.pad_mode) + weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) + indices = torch.arange(x.shape[1], device=x.device) + weight[indices, indices] = self.kernel.to(weight) + return F.conv2d(x, weight, stride=2) + + +class Upsample2d(nn.Module): + def __init__(self, kernel='linear', pad_mode='reflect'): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor([_kernels[kernel]]) * 2 + 
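# --- Illustrative shape check (not part of the diff): Downsample2d and Upsample2d
# defined here apply a fixed per-channel low-pass kernel and halve or double the
# spatial size. Input size and channel count are placeholder values.
import torch
x = torch.randn(1, 3, 64, 64)
print(Downsample2d('linear')(x).shape)   # torch.Size([1, 3, 32, 32])
print(Upsample2d('linear')(x).shape)     # torch.Size([1, 3, 128, 128])
# --- end sketch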
self.pad = kernel_1d.shape[1] // 2 - 1 + self.register_buffer('kernel', kernel_1d.T @ kernel_1d) + + def forward(self, x): + x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) + weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) + indices = torch.arange(x.shape[1], device=x.device) + weight[indices, indices] = self.kernel.to(weight) + return F.conv_transpose2d(x, weight, stride=2, padding=self.pad * 2 + 1) + + +# Embeddings + +class FourierFeatures(nn.Module): + def __init__(self, in_features, out_features, std=1.): + super().__init__() + assert out_features % 2 == 0 + self.register_buffer('weight', torch.randn([out_features // 2, in_features]) * std) + + def forward(self, input): + f = 2 * math.pi * input @ self.weight.T + return torch.cat([f.cos(), f.sin()], dim=-1) + + +# U-Nets + +class UNet(ConditionedModule): + def __init__(self, d_blocks, u_blocks, skip_stages=0): + super().__init__() + self.d_blocks = nn.ModuleList(d_blocks) + self.u_blocks = nn.ModuleList(u_blocks) + self.skip_stages = skip_stages + + def forward(self, input, cond): + skips = [] + for block in self.d_blocks[self.skip_stages:]: + input = block(input, cond) + skips.append(input) + for i, (block, skip) in enumerate(zip(self.u_blocks, reversed(skips))): + input = block(input, cond, skip if i > 0 else None) + return input diff --git a/repositories/k-diffusion/k_diffusion/models/__init__.py b/repositories/k-diffusion/k_diffusion/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82608ff1de6137b31eeaf8de6814df6a7e35606a --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/models/__init__.py @@ -0,0 +1 @@ +from .image_v1 import ImageDenoiserModelV1 diff --git a/repositories/k-diffusion/k_diffusion/models/__pycache__/__init__.cpython-310.pyc b/repositories/k-diffusion/k_diffusion/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c44f350597cdcac9e0569bfbe101152d614badd5 Binary files /dev/null and b/repositories/k-diffusion/k_diffusion/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/repositories/k-diffusion/k_diffusion/models/__pycache__/image_v1.cpython-310.pyc b/repositories/k-diffusion/k_diffusion/models/__pycache__/image_v1.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f3601f9a7ffa017ebf1bb7a57ad6f77b9277bdb Binary files /dev/null and b/repositories/k-diffusion/k_diffusion/models/__pycache__/image_v1.cpython-310.pyc differ diff --git a/repositories/k-diffusion/k_diffusion/models/image_v1.py b/repositories/k-diffusion/k_diffusion/models/image_v1.py new file mode 100644 index 0000000000000000000000000000000000000000..9ffd5f2c4d6c9d086107d5fac67452419696c723 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/models/image_v1.py @@ -0,0 +1,156 @@ +import math + +import torch +from torch import nn +from torch.nn import functional as F + +from .. 
import layers, utils + + +def orthogonal_(module): + nn.init.orthogonal_(module.weight) + return module + + +class ResConvBlock(layers.ConditionedResidualBlock): + def __init__(self, feats_in, c_in, c_mid, c_out, group_size=32, dropout_rate=0.): + skip = None if c_in == c_out else orthogonal_(nn.Conv2d(c_in, c_out, 1, bias=False)) + super().__init__( + layers.AdaGN(feats_in, c_in, max(1, c_in // group_size)), + nn.GELU(), + nn.Conv2d(c_in, c_mid, 3, padding=1), + nn.Dropout2d(dropout_rate, inplace=True), + layers.AdaGN(feats_in, c_mid, max(1, c_mid // group_size)), + nn.GELU(), + nn.Conv2d(c_mid, c_out, 3, padding=1), + nn.Dropout2d(dropout_rate, inplace=True), + skip=skip) + + +class DBlock(layers.ConditionedSequential): + def __init__(self, n_layers, feats_in, c_in, c_mid, c_out, group_size=32, head_size=64, dropout_rate=0., downsample=False, self_attn=False, cross_attn=False, c_enc=0): + modules = [nn.Identity()] + for i in range(n_layers): + my_c_in = c_in if i == 0 else c_mid + my_c_out = c_mid if i < n_layers - 1 else c_out + modules.append(ResConvBlock(feats_in, my_c_in, c_mid, my_c_out, group_size, dropout_rate)) + if self_attn: + norm = lambda c_in: layers.AdaGN(feats_in, c_in, max(1, my_c_out // group_size)) + modules.append(layers.SelfAttention2d(my_c_out, max(1, my_c_out // head_size), norm, dropout_rate)) + if cross_attn: + norm = lambda c_in: layers.AdaGN(feats_in, c_in, max(1, my_c_out // group_size)) + modules.append(layers.CrossAttention2d(my_c_out, c_enc, max(1, my_c_out // head_size), norm, dropout_rate)) + super().__init__(*modules) + self.set_downsample(downsample) + + def set_downsample(self, downsample): + self[0] = layers.Downsample2d() if downsample else nn.Identity() + return self + + +class UBlock(layers.ConditionedSequential): + def __init__(self, n_layers, feats_in, c_in, c_mid, c_out, group_size=32, head_size=64, dropout_rate=0., upsample=False, self_attn=False, cross_attn=False, c_enc=0): + modules = [] + for i in range(n_layers): + my_c_in = c_in if i == 0 else c_mid + my_c_out = c_mid if i < n_layers - 1 else c_out + modules.append(ResConvBlock(feats_in, my_c_in, c_mid, my_c_out, group_size, dropout_rate)) + if self_attn: + norm = lambda c_in: layers.AdaGN(feats_in, c_in, max(1, my_c_out // group_size)) + modules.append(layers.SelfAttention2d(my_c_out, max(1, my_c_out // head_size), norm, dropout_rate)) + if cross_attn: + norm = lambda c_in: layers.AdaGN(feats_in, c_in, max(1, my_c_out // group_size)) + modules.append(layers.CrossAttention2d(my_c_out, c_enc, max(1, my_c_out // head_size), norm, dropout_rate)) + modules.append(nn.Identity()) + super().__init__(*modules) + self.set_upsample(upsample) + + def forward(self, input, cond, skip=None): + if skip is not None: + input = torch.cat([input, skip], dim=1) + return super().forward(input, cond) + + def set_upsample(self, upsample): + self[-1] = layers.Upsample2d() if upsample else nn.Identity() + return self + + +class MappingNet(nn.Sequential): + def __init__(self, feats_in, feats_out, n_layers=2): + layers = [] + for i in range(n_layers): + layers.append(orthogonal_(nn.Linear(feats_in if i == 0 else feats_out, feats_out))) + layers.append(nn.GELU()) + super().__init__(*layers) + + +class ImageDenoiserModelV1(nn.Module): + def __init__(self, c_in, feats_in, depths, channels, self_attn_depths, cross_attn_depths=None, mapping_cond_dim=0, unet_cond_dim=0, cross_cond_dim=0, dropout_rate=0., patch_size=1, skip_stages=0, has_variance=False): + super().__init__() + self.c_in = c_in + self.channels = channels + 
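# --- Illustrative usage sketch (not part of the diff): constructing the denoiser
# defined here directly and running one denoising call through the Karras
# preconditioner from layers.py. All sizes are placeholder assumptions.
import torch
from k_diffusion import layers
from k_diffusion.models import ImageDenoiserModelV1

inner = ImageDenoiserModelV1(c_in=3, feats_in=256, depths=[2, 2], channels=[64, 128],
                             self_attn_depths=[False, True])
model = layers.Denoiser(inner, sigma_data=0.5)
x = torch.randn(2, 3, 32, 32)
sigma = torch.full([2], 1.0)
denoised = model(x, sigma)                            # same shape as x
# --- end sketch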
self.unet_cond_dim = unet_cond_dim + self.patch_size = patch_size + self.has_variance = has_variance + self.timestep_embed = layers.FourierFeatures(1, feats_in) + if mapping_cond_dim > 0: + self.mapping_cond = nn.Linear(mapping_cond_dim, feats_in, bias=False) + self.mapping = MappingNet(feats_in, feats_in) + self.proj_in = nn.Conv2d((c_in + unet_cond_dim) * self.patch_size ** 2, channels[max(0, skip_stages - 1)], 1) + self.proj_out = nn.Conv2d(channels[max(0, skip_stages - 1)], c_in * self.patch_size ** 2 + (1 if self.has_variance else 0), 1) + nn.init.zeros_(self.proj_out.weight) + nn.init.zeros_(self.proj_out.bias) + if cross_cond_dim == 0: + cross_attn_depths = [False] * len(self_attn_depths) + d_blocks, u_blocks = [], [] + for i in range(len(depths)): + my_c_in = channels[max(0, i - 1)] + d_blocks.append(DBlock(depths[i], feats_in, my_c_in, channels[i], channels[i], downsample=i > skip_stages, self_attn=self_attn_depths[i], cross_attn=cross_attn_depths[i], c_enc=cross_cond_dim, dropout_rate=dropout_rate)) + for i in range(len(depths)): + my_c_in = channels[i] * 2 if i < len(depths) - 1 else channels[i] + my_c_out = channels[max(0, i - 1)] + u_blocks.append(UBlock(depths[i], feats_in, my_c_in, channels[i], my_c_out, upsample=i > skip_stages, self_attn=self_attn_depths[i], cross_attn=cross_attn_depths[i], c_enc=cross_cond_dim, dropout_rate=dropout_rate)) + self.u_net = layers.UNet(d_blocks, reversed(u_blocks), skip_stages=skip_stages) + + def forward(self, input, sigma, mapping_cond=None, unet_cond=None, cross_cond=None, cross_cond_padding=None, return_variance=False): + c_noise = sigma.log() / 4 + timestep_embed = self.timestep_embed(utils.append_dims(c_noise, 2)) + mapping_cond_embed = torch.zeros_like(timestep_embed) if mapping_cond is None else self.mapping_cond(mapping_cond) + mapping_out = self.mapping(timestep_embed + mapping_cond_embed) + cond = {'cond': mapping_out} + if unet_cond is not None: + input = torch.cat([input, unet_cond], dim=1) + if cross_cond is not None: + cond['cross'] = cross_cond + cond['cross_padding'] = cross_cond_padding + if self.patch_size > 1: + input = F.pixel_unshuffle(input, self.patch_size) + input = self.proj_in(input) + input = self.u_net(input, cond) + input = self.proj_out(input) + if self.has_variance: + input, logvar = input[:, :-1], input[:, -1].flatten(1).mean(1) + if self.patch_size > 1: + input = F.pixel_shuffle(input, self.patch_size) + if self.has_variance and return_variance: + return input, logvar + return input + + def set_skip_stages(self, skip_stages): + self.proj_in = nn.Conv2d(self.proj_in.in_channels, self.channels[max(0, skip_stages - 1)], 1) + self.proj_out = nn.Conv2d(self.channels[max(0, skip_stages - 1)], self.proj_out.out_channels, 1) + nn.init.zeros_(self.proj_out.weight) + nn.init.zeros_(self.proj_out.bias) + self.u_net.skip_stages = skip_stages + for i, block in enumerate(self.u_net.d_blocks): + block.set_downsample(i > skip_stages) + for i, block in enumerate(reversed(self.u_net.u_blocks)): + block.set_upsample(i > skip_stages) + return self + + def set_patch_size(self, patch_size): + self.patch_size = patch_size + self.proj_in = nn.Conv2d((self.c_in + self.unet_cond_dim) * self.patch_size ** 2, self.channels[max(0, self.u_net.skip_stages - 1)], 1) + self.proj_out = nn.Conv2d(self.channels[max(0, self.u_net.skip_stages - 1)], self.c_in * self.patch_size ** 2 + (1 if self.has_variance else 0), 1) + nn.init.zeros_(self.proj_out.weight) + nn.init.zeros_(self.proj_out.bias) diff --git 
a/repositories/k-diffusion/k_diffusion/sampling.py b/repositories/k-diffusion/k_diffusion/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..6656e80b0c3a62de99b954e1fce53891c7df3a4d --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/sampling.py @@ -0,0 +1,651 @@ +import math + +from scipy import integrate +import torch +from torch import nn +from torchdiffeq import odeint +import torchsde +from tqdm.auto import trange, tqdm + +from . import utils + + +def append_zero(x): + return torch.cat([x, x.new_zeros([1])]) + + +def get_sigmas_karras(n, sigma_min, sigma_max, rho=7., device='cpu'): + """Constructs the noise schedule of Karras et al. (2022).""" + ramp = torch.linspace(0, 1, n) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return append_zero(sigmas).to(device) + + +def get_sigmas_exponential(n, sigma_min, sigma_max, device='cpu'): + """Constructs an exponential noise schedule.""" + sigmas = torch.linspace(math.log(sigma_max), math.log(sigma_min), n, device=device).exp() + return append_zero(sigmas) + + +def get_sigmas_polyexponential(n, sigma_min, sigma_max, rho=1., device='cpu'): + """Constructs an polynomial in log sigma noise schedule.""" + ramp = torch.linspace(1, 0, n, device=device) ** rho + sigmas = torch.exp(ramp * (math.log(sigma_max) - math.log(sigma_min)) + math.log(sigma_min)) + return append_zero(sigmas) + + +def get_sigmas_vp(n, beta_d=19.9, beta_min=0.1, eps_s=1e-3, device='cpu'): + """Constructs a continuous VP noise schedule.""" + t = torch.linspace(1, eps_s, n, device=device) + sigmas = torch.sqrt(torch.exp(beta_d * t ** 2 / 2 + beta_min * t) - 1) + return append_zero(sigmas) + + +def to_d(x, sigma, denoised): + """Converts a denoiser output to a Karras ODE derivative.""" + return (x - denoised) / utils.append_dims(sigma, x.ndim) + + +def get_ancestral_step(sigma_from, sigma_to, eta=1.): + """Calculates the noise level (sigma_down) to step down to and the amount + of noise to add (sigma_up) when doing an ancestral sampling step.""" + if not eta: + return sigma_to, 0. + sigma_up = min(sigma_to, eta * (sigma_to ** 2 * (sigma_from ** 2 - sigma_to ** 2) / sigma_from ** 2) ** 0.5) + sigma_down = (sigma_to ** 2 - sigma_up ** 2) ** 0.5 + return sigma_down, sigma_up + + +def default_noise_sampler(x): + return lambda sigma, sigma_next: torch.randn_like(x) + + +class BatchedBrownianTree: + """A wrapper around torchsde.BrownianTree that enables batches of entropy.""" + + def __init__(self, x, t0, t1, seed=None, **kwargs): + t0, t1, self.sign = self.sort(t0, t1) + w0 = kwargs.get('w0', torch.zeros_like(x)) + if seed is None: + seed = torch.randint(0, 2 ** 63 - 1, []).item() + self.batched = True + try: + assert len(seed) == x.shape[0] + w0 = w0[0] + except TypeError: + seed = [seed] + self.batched = False + self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed] + + @staticmethod + def sort(a, b): + return (a, b, 1) if a < b else (b, a, -1) + + def __call__(self, t0, t1): + t0, t1, sign = self.sort(t0, t1) + w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign) + return w if self.batched else w[0] + + +class BrownianTreeNoiseSampler: + """A noise sampler backed by a torchsde.BrownianTree. + + Args: + x (Tensor): The tensor whose shape, device and dtype to use to generate + random samples. + sigma_min (float): The low end of the valid interval. 
+ sigma_max (float): The high end of the valid interval. + seed (int or List[int]): The random seed. If a list of seeds is + supplied instead of a single integer, then the noise sampler will + use one BrownianTree per batch item, each with its own seed. + transform (callable): A function that maps sigma to the sampler's + internal timestep. + """ + + def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x): + self.transform = transform + t0, t1 = self.transform(torch.as_tensor(sigma_min)), self.transform(torch.as_tensor(sigma_max)) + self.tree = BatchedBrownianTree(x, t0, t1, seed) + + def __call__(self, sigma, sigma_next): + t0, t1 = self.transform(torch.as_tensor(sigma)), self.transform(torch.as_tensor(sigma_next)) + return self.tree(t0, t1) / (t1 - t0).abs().sqrt() + + +@torch.no_grad() +def sample_euler(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.): + """Implements Algorithm 2 (Euler steps) from Karras et al. (2022).""" + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + for i in trange(len(sigmas) - 1, disable=disable): + gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0. + eps = torch.randn_like(x) * s_noise + sigma_hat = sigmas[i] * (gamma + 1) + if gamma > 0: + x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5 + denoised = model(x, sigma_hat * s_in, **extra_args) + d = to_d(x, sigma_hat, denoised) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) + dt = sigmas[i + 1] - sigma_hat + # Euler method + x = x + d * dt + return x + + +@torch.no_grad() +def sample_euler_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): + """Ancestral sampling with Euler method steps.""" + extra_args = {} if extra_args is None else extra_args + noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler + s_in = x.new_ones([x.shape[0]]) + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + d = to_d(x, sigmas[i], denoised) + # Euler method + dt = sigma_down - sigmas[i] + x = x + d * dt + if sigmas[i + 1] > 0: + x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + return x + + +@torch.no_grad() +def sample_heun(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.): + """Implements Algorithm 2 (Heun steps) from Karras et al. (2022).""" + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + for i in trange(len(sigmas) - 1, disable=disable): + gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0. 
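# --- Illustrative toy run (not part of the diff): the samplers in this file only need
# a callable model(x, sigma, **extra_args) that returns the denoised estimate. A toy
# model that always predicts zeros makes Euler sampling collapse the input to exactly
# zero on the final step of the Karras schedule. Sizes are placeholder values.
import torch
toy_model = lambda x, sigma, **kwargs: torch.zeros_like(x)
sigmas = get_sigmas_karras(n=10, sigma_min=0.1, sigma_max=10.0)
x = torch.randn([1, 3, 8, 8]) * sigmas[0]
out = sample_euler(toy_model, x, sigmas)              # each step scales x by sigmas[i + 1] / sigmas[i]
# --- end sketch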
+ eps = torch.randn_like(x) * s_noise + sigma_hat = sigmas[i] * (gamma + 1) + if gamma > 0: + x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5 + denoised = model(x, sigma_hat * s_in, **extra_args) + d = to_d(x, sigma_hat, denoised) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) + dt = sigmas[i + 1] - sigma_hat + if sigmas[i + 1] == 0: + # Euler method + x = x + d * dt + else: + # Heun's method + x_2 = x + d * dt + denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args) + d_2 = to_d(x_2, sigmas[i + 1], denoised_2) + d_prime = (d + d_2) / 2 + x = x + d_prime * dt + return x + + +@torch.no_grad() +def sample_dpm_2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.): + """A sampler inspired by DPM-Solver-2 and Algorithm 2 from Karras et al. (2022).""" + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + for i in trange(len(sigmas) - 1, disable=disable): + gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0. + eps = torch.randn_like(x) * s_noise + sigma_hat = sigmas[i] * (gamma + 1) + if gamma > 0: + x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5 + denoised = model(x, sigma_hat * s_in, **extra_args) + d = to_d(x, sigma_hat, denoised) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised}) + if sigmas[i + 1] == 0: + # Euler method + dt = sigmas[i + 1] - sigma_hat + x = x + d * dt + else: + # DPM-Solver-2 + sigma_mid = sigma_hat.log().lerp(sigmas[i + 1].log(), 0.5).exp() + dt_1 = sigma_mid - sigma_hat + dt_2 = sigmas[i + 1] - sigma_hat + x_2 = x + d * dt_1 + denoised_2 = model(x_2, sigma_mid * s_in, **extra_args) + d_2 = to_d(x_2, sigma_mid, denoised_2) + x = x + d_2 * dt_2 + return x + + +@torch.no_grad() +def sample_dpm_2_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): + """Ancestral sampling with DPM-Solver second-order steps.""" + extra_args = {} if extra_args is None else extra_args + noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler + s_in = x.new_ones([x.shape[0]]) + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + d = to_d(x, sigmas[i], denoised) + if sigma_down == 0: + # Euler method + dt = sigma_down - sigmas[i] + x = x + d * dt + else: + # DPM-Solver-2 + sigma_mid = sigmas[i].log().lerp(sigma_down.log(), 0.5).exp() + dt_1 = sigma_mid - sigmas[i] + dt_2 = sigma_down - sigmas[i] + x_2 = x + d * dt_1 + denoised_2 = model(x_2, sigma_mid * s_in, **extra_args) + d_2 = to_d(x_2, sigma_mid, denoised_2) + x = x + d_2 * dt_2 + x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + return x + + +def linear_multistep_coeff(order, t, i, j): + if order - 1 > i: + raise ValueError(f'Order {order} too high for step {i}') + def fn(tau): + prod = 1. 
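# --- Worked example (not part of the diff): get_ancestral_step above splits a move
# from sigma_from to sigma_to into a deterministic part (sigma_down) and fresh noise
# (sigma_up) such that sigma_down ** 2 + sigma_up ** 2 == sigma_to ** 2.
sigma_down, sigma_up = get_ancestral_step(sigma_from=1.0, sigma_to=0.5, eta=1.)
# sigma_up = min(0.5, (0.5 ** 2 * (1.0 ** 2 - 0.5 ** 2) / 1.0 ** 2) ** 0.5) ≈ 0.4330
# sigma_down = (0.5 ** 2 - sigma_up ** 2) ** 0.5 = 0.25
# --- end sketch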
+ for k in range(order): + if j == k: + continue + prod *= (tau - t[i - k]) / (t[i - j] - t[i - k]) + return prod + return integrate.quad(fn, t[i], t[i + 1], epsrel=1e-4)[0] + + +@torch.no_grad() +def sample_lms(model, x, sigmas, extra_args=None, callback=None, disable=None, order=4): + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + sigmas_cpu = sigmas.detach().cpu().numpy() + ds = [] + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + d = to_d(x, sigmas[i], denoised) + ds.append(d) + if len(ds) > order: + ds.pop(0) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + cur_order = min(i + 1, order) + coeffs = [linear_multistep_coeff(cur_order, sigmas_cpu, i, j) for j in range(cur_order)] + x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds))) + return x + + +@torch.no_grad() +def log_likelihood(model, x, sigma_min, sigma_max, extra_args=None, atol=1e-4, rtol=1e-4): + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + v = torch.randint_like(x, 2) * 2 - 1 + fevals = 0 + def ode_fn(sigma, x): + nonlocal fevals + with torch.enable_grad(): + x = x[0].detach().requires_grad_() + denoised = model(x, sigma * s_in, **extra_args) + d = to_d(x, sigma, denoised) + fevals += 1 + grad = torch.autograd.grad((d * v).sum(), x)[0] + d_ll = (v * grad).flatten(1).sum(1) + return d.detach(), d_ll + x_min = x, x.new_zeros([x.shape[0]]) + t = x.new_tensor([sigma_min, sigma_max]) + sol = odeint(ode_fn, x_min, t, atol=atol, rtol=rtol, method='dopri5') + latent, delta_ll = sol[0][-1], sol[1][-1] + ll_prior = torch.distributions.Normal(0, sigma_max).log_prob(latent).flatten(1).sum(1) + return ll_prior + delta_ll, {'fevals': fevals} + + +class PIDStepSizeController: + """A PID controller for ODE adaptive step size control.""" + def __init__(self, h, pcoeff, icoeff, dcoeff, order=1, accept_safety=0.81, eps=1e-8): + self.h = h + self.b1 = (pcoeff + icoeff + dcoeff) / order + self.b2 = -(pcoeff + 2 * dcoeff) / order + self.b3 = dcoeff / order + self.accept_safety = accept_safety + self.eps = eps + self.errs = [] + + def limiter(self, x): + return 1 + math.atan(x - 1) + + def propose_step(self, error): + inv_error = 1 / (float(error) + self.eps) + if not self.errs: + self.errs = [inv_error, inv_error, inv_error] + self.errs[0] = inv_error + factor = self.errs[0] ** self.b1 * self.errs[1] ** self.b2 * self.errs[2] ** self.b3 + factor = self.limiter(factor) + accept = factor >= self.accept_safety + if accept: + self.errs[2] = self.errs[1] + self.errs[1] = self.errs[0] + self.h *= factor + return accept + + +class DPMSolver(nn.Module): + """DPM-Solver. 
See https://arxiv.org/abs/2206.00927.""" + + def __init__(self, model, extra_args=None, eps_callback=None, info_callback=None): + super().__init__() + self.model = model + self.extra_args = {} if extra_args is None else extra_args + self.eps_callback = eps_callback + self.info_callback = info_callback + + def t(self, sigma): + return -sigma.log() + + def sigma(self, t): + return t.neg().exp() + + def eps(self, eps_cache, key, x, t, *args, **kwargs): + if key in eps_cache: + return eps_cache[key], eps_cache + sigma = self.sigma(t) * x.new_ones([x.shape[0]]) + eps = (x - self.model(x, sigma, *args, **self.extra_args, **kwargs)) / self.sigma(t) + if self.eps_callback is not None: + self.eps_callback() + return eps, {key: eps, **eps_cache} + + def dpm_solver_1_step(self, x, t, t_next, eps_cache=None): + eps_cache = {} if eps_cache is None else eps_cache + h = t_next - t + eps, eps_cache = self.eps(eps_cache, 'eps', x, t) + x_1 = x - self.sigma(t_next) * h.expm1() * eps + return x_1, eps_cache + + def dpm_solver_2_step(self, x, t, t_next, r1=1 / 2, eps_cache=None): + eps_cache = {} if eps_cache is None else eps_cache + h = t_next - t + eps, eps_cache = self.eps(eps_cache, 'eps', x, t) + s1 = t + r1 * h + u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps + eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1) + x_2 = x - self.sigma(t_next) * h.expm1() * eps - self.sigma(t_next) / (2 * r1) * h.expm1() * (eps_r1 - eps) + return x_2, eps_cache + + def dpm_solver_3_step(self, x, t, t_next, r1=1 / 3, r2=2 / 3, eps_cache=None): + eps_cache = {} if eps_cache is None else eps_cache + h = t_next - t + eps, eps_cache = self.eps(eps_cache, 'eps', x, t) + s1 = t + r1 * h + s2 = t + r2 * h + u1 = x - self.sigma(s1) * (r1 * h).expm1() * eps + eps_r1, eps_cache = self.eps(eps_cache, 'eps_r1', u1, s1) + u2 = x - self.sigma(s2) * (r2 * h).expm1() * eps - self.sigma(s2) * (r2 / r1) * ((r2 * h).expm1() / (r2 * h) - 1) * (eps_r1 - eps) + eps_r2, eps_cache = self.eps(eps_cache, 'eps_r2', u2, s2) + x_3 = x - self.sigma(t_next) * h.expm1() * eps - self.sigma(t_next) / r2 * (h.expm1() / h - 1) * (eps_r2 - eps) + return x_3, eps_cache + + def dpm_solver_fast(self, x, t_start, t_end, nfe, eta=0., s_noise=1., noise_sampler=None): + noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler + if not t_end > t_start and eta: + raise ValueError('eta must be 0 for reverse sampling') + + m = math.floor(nfe / 3) + 1 + ts = torch.linspace(t_start, t_end, m + 1, device=x.device) + + if nfe % 3 == 0: + orders = [3] * (m - 2) + [2, 1] + else: + orders = [3] * (m - 1) + [nfe % 3] + + for i in range(len(orders)): + eps_cache = {} + t, t_next = ts[i], ts[i + 1] + if eta: + sd, su = get_ancestral_step(self.sigma(t), self.sigma(t_next), eta) + t_next_ = torch.minimum(t_end, self.t(sd)) + su = (self.sigma(t_next) ** 2 - self.sigma(t_next_) ** 2) ** 0.5 + else: + t_next_, su = t_next, 0. 
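# --- Illustrative usage sketch (not part of the diff): DPMSolver works in
# t = -log(sigma); sample_dpm_fast below converts a sigma range to t and drives this
# solver with a fixed function-evaluation budget. The zero-predicting toy model and
# all sizes are placeholder assumptions.
import torch
toy_model = lambda x, sigma, **kwargs: torch.zeros_like(x)
x = torch.randn([1, 3, 8, 8]) * 10.0
out = sample_dpm_fast(toy_model, x, sigma_min=0.1, sigma_max=10.0, n=9)
# --- end sketch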
+ + eps, eps_cache = self.eps(eps_cache, 'eps', x, t) + denoised = x - self.sigma(t) * eps + if self.info_callback is not None: + self.info_callback({'x': x, 'i': i, 't': ts[i], 't_up': t, 'denoised': denoised}) + + if orders[i] == 1: + x, eps_cache = self.dpm_solver_1_step(x, t, t_next_, eps_cache=eps_cache) + elif orders[i] == 2: + x, eps_cache = self.dpm_solver_2_step(x, t, t_next_, eps_cache=eps_cache) + else: + x, eps_cache = self.dpm_solver_3_step(x, t, t_next_, eps_cache=eps_cache) + + x = x + su * s_noise * noise_sampler(self.sigma(t), self.sigma(t_next)) + + return x + + def dpm_solver_adaptive(self, x, t_start, t_end, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None): + noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler + if order not in {2, 3}: + raise ValueError('order should be 2 or 3') + forward = t_end > t_start + if not forward and eta: + raise ValueError('eta must be 0 for reverse sampling') + h_init = abs(h_init) * (1 if forward else -1) + atol = torch.tensor(atol) + rtol = torch.tensor(rtol) + s = t_start + x_prev = x + accept = True + pid = PIDStepSizeController(h_init, pcoeff, icoeff, dcoeff, 1.5 if eta else order, accept_safety) + info = {'steps': 0, 'nfe': 0, 'n_accept': 0, 'n_reject': 0} + + while s < t_end - 1e-5 if forward else s > t_end + 1e-5: + eps_cache = {} + t = torch.minimum(t_end, s + pid.h) if forward else torch.maximum(t_end, s + pid.h) + if eta: + sd, su = get_ancestral_step(self.sigma(s), self.sigma(t), eta) + t_ = torch.minimum(t_end, self.t(sd)) + su = (self.sigma(t) ** 2 - self.sigma(t_) ** 2) ** 0.5 + else: + t_, su = t, 0. + + eps, eps_cache = self.eps(eps_cache, 'eps', x, s) + denoised = x - self.sigma(s) * eps + + if order == 2: + x_low, eps_cache = self.dpm_solver_1_step(x, s, t_, eps_cache=eps_cache) + x_high, eps_cache = self.dpm_solver_2_step(x, s, t_, eps_cache=eps_cache) + else: + x_low, eps_cache = self.dpm_solver_2_step(x, s, t_, r1=1 / 3, eps_cache=eps_cache) + x_high, eps_cache = self.dpm_solver_3_step(x, s, t_, eps_cache=eps_cache) + delta = torch.maximum(atol, rtol * torch.maximum(x_low.abs(), x_prev.abs())) + error = torch.linalg.norm((x_low - x_high) / delta) / x.numel() ** 0.5 + accept = pid.propose_step(error) + if accept: + x_prev = x_low + x = x_high + su * s_noise * noise_sampler(self.sigma(s), self.sigma(t)) + s = t + info['n_accept'] += 1 + else: + info['n_reject'] += 1 + info['nfe'] += order + info['steps'] += 1 + + if self.info_callback is not None: + self.info_callback({'x': x, 'i': info['steps'] - 1, 't': s, 't_up': s, 'denoised': denoised, 'error': error, 'h': pid.h, **info}) + + return x, info + + +@torch.no_grad() +def sample_dpm_fast(model, x, sigma_min, sigma_max, n, extra_args=None, callback=None, disable=None, eta=0., s_noise=1., noise_sampler=None): + """DPM-Solver-Fast (fixed step size). 
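n is the total number of model evaluations (NFE); third-order solver steps are used wherever possible.
    Illustrative usage (a sketch only; assumes a k_diffusion Denoiser-wrapped `model`, a `device`, and sigma_min/sigma_max taken from the model config):
        x = torch.randn([4, 3, 64, 64], device=device) * sigma_max
        samples = sample_dpm_fast(model, x, sigma_min, sigma_max, n=30)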
See https://arxiv.org/abs/2206.00927.""" + if sigma_min <= 0 or sigma_max <= 0: + raise ValueError('sigma_min and sigma_max must not be 0') + with tqdm(total=n, disable=disable) as pbar: + dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update) + if callback is not None: + dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info}) + return dpm_solver.dpm_solver_fast(x, dpm_solver.t(torch.tensor(sigma_max)), dpm_solver.t(torch.tensor(sigma_min)), n, eta, s_noise, noise_sampler) + + +@torch.no_grad() +def sample_dpm_adaptive(model, x, sigma_min, sigma_max, extra_args=None, callback=None, disable=None, order=3, rtol=0.05, atol=0.0078, h_init=0.05, pcoeff=0., icoeff=1., dcoeff=0., accept_safety=0.81, eta=0., s_noise=1., noise_sampler=None, return_info=False): + """DPM-Solver-12 and 23 (adaptive step size). See https://arxiv.org/abs/2206.00927.""" + if sigma_min <= 0 or sigma_max <= 0: + raise ValueError('sigma_min and sigma_max must not be 0') + with tqdm(disable=disable) as pbar: + dpm_solver = DPMSolver(model, extra_args, eps_callback=pbar.update) + if callback is not None: + dpm_solver.info_callback = lambda info: callback({'sigma': dpm_solver.sigma(info['t']), 'sigma_hat': dpm_solver.sigma(info['t_up']), **info}) + x, info = dpm_solver.dpm_solver_adaptive(x, dpm_solver.t(torch.tensor(sigma_max)), dpm_solver.t(torch.tensor(sigma_min)), order, rtol, atol, h_init, pcoeff, icoeff, dcoeff, accept_safety, eta, s_noise, noise_sampler) + if return_info: + return x, info + return x + + +@torch.no_grad() +def sample_dpmpp_2s_ancestral(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None): + """Ancestral sampling with DPM-Solver++(2S) second-order steps.""" + extra_args = {} if extra_args is None else extra_args + noise_sampler = default_noise_sampler(x) if noise_sampler is None else noise_sampler + s_in = x.new_ones([x.shape[0]]) + sigma_fn = lambda t: t.neg().exp() + t_fn = lambda sigma: sigma.log().neg() + + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + sigma_down, sigma_up = get_ancestral_step(sigmas[i], sigmas[i + 1], eta=eta) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigma_down == 0: + # Euler method + d = to_d(x, sigmas[i], denoised) + dt = sigma_down - sigmas[i] + x = x + d * dt + else: + # DPM-Solver++(2S) + t, t_next = t_fn(sigmas[i]), t_fn(sigma_down) + r = 1 / 2 + h = t_next - t + s = t + r * h + x_2 = (sigma_fn(s) / sigma_fn(t)) * x - (-h * r).expm1() * denoised + denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args) + x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_2 + # Noise addition + if sigmas[i + 1] > 0: + x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up + return x + + +@torch.no_grad() +def sample_dpmpp_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, r=1 / 2): + """DPM-Solver++ (stochastic).""" + sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max) if noise_sampler is None else noise_sampler + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + sigma_fn = lambda t: t.neg().exp() + t_fn = lambda sigma: sigma.log().neg() + + for i in trange(len(sigmas) - 1, disable=disable): + 
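# One stochastic DPM-Solver++ step from sigmas[i] to sigmas[i + 1]: the model is evaluated at sigmas[i] and once more at an intermediate noise level set by r, with ancestral noise injected after each sub-step.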
denoised = model(x, sigmas[i] * s_in, **extra_args) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: + # Euler method + d = to_d(x, sigmas[i], denoised) + dt = sigmas[i + 1] - sigmas[i] + x = x + d * dt + else: + # DPM-Solver++ + t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1]) + h = t_next - t + s = t + h * r + fac = 1 / (2 * r) + + # Step 1 + sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(s), eta) + s_ = t_fn(sd) + x_2 = (sigma_fn(s_) / sigma_fn(t)) * x - (t - s_).expm1() * denoised + x_2 = x_2 + noise_sampler(sigma_fn(t), sigma_fn(s)) * s_noise * su + denoised_2 = model(x_2, sigma_fn(s) * s_in, **extra_args) + + # Step 2 + sd, su = get_ancestral_step(sigma_fn(t), sigma_fn(t_next), eta) + t_next_ = t_fn(sd) + denoised_d = (1 - fac) * denoised + fac * denoised_2 + x = (sigma_fn(t_next_) / sigma_fn(t)) * x - (t - t_next_).expm1() * denoised_d + x = x + noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * su + return x + + +@torch.no_grad() +def sample_dpmpp_2m(model, x, sigmas, extra_args=None, callback=None, disable=None): + """DPM-Solver++(2M).""" + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + sigma_fn = lambda t: t.neg().exp() + t_fn = lambda sigma: sigma.log().neg() + old_denoised = None + + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + t, t_next = t_fn(sigmas[i]), t_fn(sigmas[i + 1]) + h = t_next - t + if old_denoised is None or sigmas[i + 1] == 0: + x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised + else: + h_last = t - t_fn(sigmas[i - 1]) + r = h_last / h + denoised_d = (1 + 1 / (2 * r)) * denoised - (1 / (2 * r)) * old_denoised + x = (sigma_fn(t_next) / sigma_fn(t)) * x - (-h).expm1() * denoised_d + old_denoised = denoised + return x + + +@torch.no_grad() +def sample_dpmpp_2m_sde(model, x, sigmas, extra_args=None, callback=None, disable=None, eta=1., s_noise=1., noise_sampler=None, solver_type='midpoint'): + """DPM-Solver++(2M) SDE.""" + + if solver_type not in {'heun', 'midpoint'}: + raise ValueError('solver_type must be \'heun\' or \'midpoint\'') + + sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max) if noise_sampler is None else noise_sampler + extra_args = {} if extra_args is None else extra_args + s_in = x.new_ones([x.shape[0]]) + + old_denoised = None + h_last = None + + for i in trange(len(sigmas) - 1, disable=disable): + denoised = model(x, sigmas[i] * s_in, **extra_args) + if callback is not None: + callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigmas[i], 'denoised': denoised}) + if sigmas[i + 1] == 0: + # Denoising step + x = denoised + else: + # DPM-Solver++(2M) SDE + t, s = -sigmas[i].log(), -sigmas[i + 1].log() + h = s - t + eta_h = eta * h + + x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + (-h - eta_h).expm1().neg() * denoised + + if old_denoised is not None: + r = h_last / h + if solver_type == 'heun': + x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * (1 / r) * (denoised - old_denoised) + elif solver_type == 'midpoint': + x = x + 0.5 * (-h - eta_h).expm1().neg() * (1 / r) * (denoised - old_denoised) + + x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise + + 
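# Carry this step's denoised prediction and step size h forward so the next iteration can apply the second-order (multistep) correction.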
old_denoised = denoised + h_last = h + return x diff --git a/repositories/k-diffusion/k_diffusion/utils.py b/repositories/k-diffusion/k_diffusion/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9afedb99276d55d5b923a04ffb62d403c9dfae93 --- /dev/null +++ b/repositories/k-diffusion/k_diffusion/utils.py @@ -0,0 +1,329 @@ +from contextlib import contextmanager +import hashlib +import math +from pathlib import Path +import shutil +import urllib +import warnings + +from PIL import Image +import torch +from torch import nn, optim +from torch.utils import data +from torchvision.transforms import functional as TF + + +def from_pil_image(x): + """Converts from a PIL image to a tensor.""" + x = TF.to_tensor(x) + if x.ndim == 2: + x = x[..., None] + return x * 2 - 1 + + +def to_pil_image(x): + """Converts from a tensor to a PIL image.""" + if x.ndim == 4: + assert x.shape[0] == 1 + x = x[0] + if x.shape[0] == 1: + x = x[0] + return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2) + + +def hf_datasets_augs_helper(examples, transform, image_key, mode='RGB'): + """Apply passed in transforms for HuggingFace Datasets.""" + images = [transform(image.convert(mode)) for image in examples[image_key]] + return {image_key: images} + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') + return x[(...,) + (None,) * dims_to_append] + + +def n_params(module): + """Returns the number of trainable parameters in a module.""" + return sum(p.numel() for p in module.parameters()) + + +def download_file(path, url, digest=None): + """Downloads a file if it does not exist, optionally checking its SHA-256 hash.""" + path = Path(path) + path.parent.mkdir(parents=True, exist_ok=True) + if not path.exists(): + with urllib.request.urlopen(url) as response, open(path, 'wb') as f: + shutil.copyfileobj(response, f) + if digest is not None: + file_digest = hashlib.sha256(open(path, 'rb').read()).hexdigest() + if digest != file_digest: + raise OSError(f'hash of {path} (url: {url}) failed to validate') + return path + + +@contextmanager +def train_mode(model, mode=True): + """A context manager that places a model into training mode and restores + the previous mode on exit.""" + modes = [module.training for module in model.modules()] + try: + yield model.train(mode) + finally: + for i, module in enumerate(model.modules()): + module.training = modes[i] + + +def eval_mode(model): + """A context manager that places a model into evaluation mode and restores + the previous mode on exit.""" + return train_mode(model, False) + + +@torch.no_grad() +def ema_update(model, averaged_model, decay): + """Incorporates updated model parameters into an exponential moving averaged + version of a model. 
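The averaged weights track the live weights with an effective time constant of roughly 1 / (1 - decay) steps.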
It should be called after each optimizer step.""" + model_params = dict(model.named_parameters()) + averaged_params = dict(averaged_model.named_parameters()) + assert model_params.keys() == averaged_params.keys() + + for name, param in model_params.items(): + averaged_params[name].mul_(decay).add_(param, alpha=1 - decay) + + model_buffers = dict(model.named_buffers()) + averaged_buffers = dict(averaged_model.named_buffers()) + assert model_buffers.keys() == averaged_buffers.keys() + + for name, buf in model_buffers.items(): + averaged_buffers[name].copy_(buf) + + +class EMAWarmup: + """Implements an EMA warmup using an inverse decay schedule. + If inv_gamma=1 and power=1, implements a simple average. inv_gamma=1, power=2/3 are + good values for models you plan to train for a million or more steps (reaches decay + factor 0.999 at 31.6K steps, 0.9999 at 1M steps), inv_gamma=1, power=3/4 for models + you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at + 215.4k steps). + Args: + inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1. + power (float): Exponential factor of EMA warmup. Default: 1. + min_value (float): The minimum EMA decay rate. Default: 0. + max_value (float): The maximum EMA decay rate. Default: 1. + start_at (int): The epoch to start averaging at. Default: 0. + last_epoch (int): The index of last epoch. Default: 0. + """ + + def __init__(self, inv_gamma=1., power=1., min_value=0., max_value=1., start_at=0, + last_epoch=0): + self.inv_gamma = inv_gamma + self.power = power + self.min_value = min_value + self.max_value = max_value + self.start_at = start_at + self.last_epoch = last_epoch + + def state_dict(self): + """Returns the state of the class as a :class:`dict`.""" + return dict(self.__dict__.items()) + + def load_state_dict(self, state_dict): + """Loads the class's state. + Args: + state_dict (dict): scaler state. Should be an object returned + from a call to :meth:`state_dict`. + """ + self.__dict__.update(state_dict) + + def get_value(self): + """Gets the current EMA decay rate.""" + epoch = max(0, self.last_epoch - self.start_at) + value = 1 - (1 + epoch / self.inv_gamma) ** -self.power + return 0. if epoch < 0 else min(self.max_value, max(self.min_value, value)) + + def step(self): + """Updates the step count.""" + self.last_epoch += 1 + + +class InverseLR(optim.lr_scheduler._LRScheduler): + """Implements an inverse decay learning rate schedule with an optional exponential + warmup. When last_epoch=-1, sets initial lr as lr. + inv_gamma is the number of steps/epochs required for the learning rate to decay to + (1 / 2)**power of its original value. + Args: + optimizer (Optimizer): Wrapped optimizer. + inv_gamma (float): Inverse multiplicative factor of learning rate decay. Default: 1. + power (float): Exponential factor of learning rate decay. Default: 1. + warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable) + Default: 0. + min_lr (float): The minimum learning rate. Default: 0. + last_epoch (int): The index of last epoch. Default: -1. + verbose (bool): If ``True``, prints a message to stdout for + each update. Default: ``False``. + """ + + def __init__(self, optimizer, inv_gamma=1., power=1., warmup=0., min_lr=0., + last_epoch=-1, verbose=False): + self.inv_gamma = inv_gamma + self.power = power + if not 0. 
<= warmup < 1: + raise ValueError('Invalid value for warmup') + self.warmup = warmup + self.min_lr = min_lr + super().__init__(optimizer, last_epoch, verbose) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.") + + return self._get_closed_form_lr() + + def _get_closed_form_lr(self): + warmup = 1 - self.warmup ** (self.last_epoch + 1) + lr_mult = (1 + self.last_epoch / self.inv_gamma) ** -self.power + return [warmup * max(self.min_lr, base_lr * lr_mult) + for base_lr in self.base_lrs] + + +class ExponentialLR(optim.lr_scheduler._LRScheduler): + """Implements an exponential learning rate schedule with an optional exponential + warmup. When last_epoch=-1, sets initial lr as lr. Decays the learning rate + continuously by decay (default 0.5) every num_steps steps. + Args: + optimizer (Optimizer): Wrapped optimizer. + num_steps (float): The number of steps to decay the learning rate by decay in. + decay (float): The factor by which to decay the learning rate every num_steps + steps. Default: 0.5. + warmup (float): Exponential warmup factor (0 <= warmup < 1, 0 to disable) + Default: 0. + min_lr (float): The minimum learning rate. Default: 0. + last_epoch (int): The index of last epoch. Default: -1. + verbose (bool): If ``True``, prints a message to stdout for + each update. Default: ``False``. + """ + + def __init__(self, optimizer, num_steps, decay=0.5, warmup=0., min_lr=0., + last_epoch=-1, verbose=False): + self.num_steps = num_steps + self.decay = decay + if not 0. <= warmup < 1: + raise ValueError('Invalid value for warmup') + self.warmup = warmup + self.min_lr = min_lr + super().__init__(optimizer, last_epoch, verbose) + + def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.") + + return self._get_closed_form_lr() + + def _get_closed_form_lr(self): + warmup = 1 - self.warmup ** (self.last_epoch + 1) + lr_mult = (self.decay ** (1 / self.num_steps)) ** self.last_epoch + return [warmup * max(self.min_lr, base_lr * lr_mult) + for base_lr in self.base_lrs] + + +def rand_log_normal(shape, loc=0., scale=1., device='cpu', dtype=torch.float32): + """Draws samples from an lognormal distribution.""" + return (torch.randn(shape, device=device, dtype=dtype) * scale + loc).exp() + + +def rand_log_logistic(shape, loc=0., scale=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32): + """Draws samples from an optionally truncated log-logistic distribution.""" + min_value = torch.as_tensor(min_value, device=device, dtype=torch.float64) + max_value = torch.as_tensor(max_value, device=device, dtype=torch.float64) + min_cdf = min_value.log().sub(loc).div(scale).sigmoid() + max_cdf = max_value.log().sub(loc).div(scale).sigmoid() + u = torch.rand(shape, device=device, dtype=torch.float64) * (max_cdf - min_cdf) + min_cdf + return u.logit().mul(scale).add(loc).exp().to(dtype) + + +def rand_log_uniform(shape, min_value, max_value, device='cpu', dtype=torch.float32): + """Draws samples from an log-uniform distribution.""" + min_value = math.log(min_value) + max_value = math.log(max_value) + return (torch.rand(shape, device=device, dtype=dtype) * (max_value - min_value) + min_value).exp() + + +def rand_v_diffusion(shape, sigma_data=1., min_value=0., max_value=float('inf'), device='cpu', dtype=torch.float32): + """Draws samples from a truncated v-diffusion training 
timestep distribution.""" + min_cdf = math.atan(min_value / sigma_data) * 2 / math.pi + max_cdf = math.atan(max_value / sigma_data) * 2 / math.pi + u = torch.rand(shape, device=device, dtype=dtype) * (max_cdf - min_cdf) + min_cdf + return torch.tan(u * math.pi / 2) * sigma_data + + +def rand_split_log_normal(shape, loc, scale_1, scale_2, device='cpu', dtype=torch.float32): + """Draws samples from a split lognormal distribution.""" + n = torch.randn(shape, device=device, dtype=dtype).abs() + u = torch.rand(shape, device=device, dtype=dtype) + n_left = n * -scale_1 + loc + n_right = n * scale_2 + loc + ratio = scale_1 / (scale_1 + scale_2) + return torch.where(u < ratio, n_left, n_right).exp() + + +class FolderOfImages(data.Dataset): + """Recursively finds all images in a directory. It does not support + classes/targets.""" + + IMG_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp'} + + def __init__(self, root, transform=None): + super().__init__() + self.root = Path(root) + self.transform = nn.Identity() if transform is None else transform + self.paths = sorted(path for path in self.root.rglob('*') if path.suffix.lower() in self.IMG_EXTENSIONS) + + def __repr__(self): + return f'FolderOfImages(root="{self.root}", len: {len(self)})' + + def __len__(self): + return len(self.paths) + + def __getitem__(self, key): + path = self.paths[key] + with open(path, 'rb') as f: + image = Image.open(f).convert('RGB') + image = self.transform(image) + return image, + + +class CSVLogger: + def __init__(self, filename, columns): + self.filename = Path(filename) + self.columns = columns + if self.filename.exists(): + self.file = open(self.filename, 'a') + else: + self.file = open(self.filename, 'w') + self.write(*self.columns) + + def write(self, *args): + print(*args, sep=',', file=self.file, flush=True) + + +@contextmanager +def tf32_mode(cudnn=None, matmul=None): + """A context manager that sets whether TF32 is allowed on cuDNN or matmul.""" + cudnn_old = torch.backends.cudnn.allow_tf32 + matmul_old = torch.backends.cuda.matmul.allow_tf32 + try: + if cudnn is not None: + torch.backends.cudnn.allow_tf32 = cudnn + if matmul is not None: + torch.backends.cuda.matmul.allow_tf32 = matmul + yield + finally: + if cudnn is not None: + torch.backends.cudnn.allow_tf32 = cudnn_old + if matmul is not None: + torch.backends.cuda.matmul.allow_tf32 = matmul_old diff --git a/repositories/k-diffusion/make_grid.py b/repositories/k-diffusion/make_grid.py new file mode 100644 index 0000000000000000000000000000000000000000..0c6616843cac1a69fdb94df804822cf07b533543 --- /dev/null +++ b/repositories/k-diffusion/make_grid.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +"""Assembles images into a grid.""" + +import argparse +import math +import sys + +from PIL import Image + + +def main(): + p = argparse.ArgumentParser(description=__doc__) + p.add_argument('images', type=str, nargs='+', metavar='image', + help='the input images') + p.add_argument('--output', '-o', type=str, default='out.png', + help='the output image') + p.add_argument('--nrow', type=int, + help='the number of images per row') + args = p.parse_args() + + images = [Image.open(image) for image in args.images] + mode = images[0].mode + size = images[0].size + for image, name in zip(images, args.images): + if image.mode != mode: + print(f'Error: Image {name} had mode {image.mode}, expected {mode}', file=sys.stderr) + sys.exit(1) + if image.size != size: + print(f'Error: Image {name} had size {image.size}, expected {size}', 
file=sys.stderr) + sys.exit(1) + + n = len(images) + x = args.nrow if args.nrow else math.ceil(n**0.5) + y = math.ceil(n / x) + + output = Image.new(mode, (size[0] * x, size[1] * y)) + for i, image in enumerate(images): + cur_x, cur_y = i % x, i // x + output.paste(image, (size[0] * cur_x, size[1] * cur_y)) + + output.save(args.output) + + +if __name__ == '__main__': + main() diff --git a/repositories/k-diffusion/pyproject.toml b/repositories/k-diffusion/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..fed528d4a7a148fd0bf0b0198a6461f8c91b87e9 --- /dev/null +++ b/repositories/k-diffusion/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/repositories/k-diffusion/requirements.txt b/repositories/k-diffusion/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..497553b0c7d493cf11d0278f5855ff296ad5e02a --- /dev/null +++ b/repositories/k-diffusion/requirements.txt @@ -0,0 +1,16 @@ +accelerate +clean-fid +clip-anytorch +einops +jsonmerge +kornia +Pillow +resize-right +scikit-image +scipy +torch +torchdiffeq +torchsde +torchvision +tqdm +wandb diff --git a/repositories/k-diffusion/sample.py b/repositories/k-diffusion/sample.py new file mode 100644 index 0000000000000000000000000000000000000000..21e0dc3c9ca055f7de73b7df7aa2841025187c18 --- /dev/null +++ b/repositories/k-diffusion/sample.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +"""Samples from k-diffusion models.""" + +import argparse +import math + +import accelerate +import torch +from tqdm import trange, tqdm + +import k_diffusion as K + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + p.add_argument('--batch-size', type=int, default=64, + help='the batch size') + p.add_argument('--checkpoint', type=str, required=True, + help='the checkpoint to use') + p.add_argument('--config', type=str, required=True, + help='the model config') + p.add_argument('-n', type=int, default=64, + help='the number of images to sample') + p.add_argument('--prefix', type=str, default='out', + help='the output prefix') + p.add_argument('--steps', type=int, default=50, + help='the number of denoising steps') + args = p.parse_args() + + config = K.config.load_config(open(args.config)) + model_config = config['model'] + # TODO: allow non-square input sizes + assert len(model_config['input_size']) == 2 and model_config['input_size'][0] == model_config['input_size'][1] + size = model_config['input_size'] + + accelerator = accelerate.Accelerator() + device = accelerator.device + print('Using device:', device, flush=True) + + inner_model = K.config.make_model(config).eval().requires_grad_(False).to(device) + inner_model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')['model_ema']) + accelerator.print('Parameters:', K.utils.n_params(inner_model)) + model = K.Denoiser(inner_model, sigma_data=model_config['sigma_data']) + + sigma_min = model_config['sigma_min'] + sigma_max = model_config['sigma_max'] + + @torch.no_grad() + @K.utils.eval_mode(model) + def run(): + if accelerator.is_local_main_process: + tqdm.write('Sampling...') + sigmas = K.sampling.get_sigmas_karras(args.steps, sigma_min, sigma_max, rho=7., device=device) + def sample_fn(n): + x = torch.randn([n, model_config['input_channels'], size[0], size[1]], device=device) * sigma_max + x_0 = K.sampling.sample_lms(model, x, sigmas, disable=not accelerator.is_local_main_process) + return 
x_0 + x_0 = K.evaluation.compute_features(accelerator, sample_fn, lambda x: x, args.n, args.batch_size) + if accelerator.is_main_process: + for i, out in enumerate(x_0): + filename = f'{args.prefix}_{i:05}.png' + K.utils.to_pil_image(out).save(filename) + + try: + run() + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + main() diff --git a/repositories/k-diffusion/sample_clip_guided.py b/repositories/k-diffusion/sample_clip_guided.py new file mode 100644 index 0000000000000000000000000000000000000000..592350196fbbac8479563be5be9e138248d94c86 --- /dev/null +++ b/repositories/k-diffusion/sample_clip_guided.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +"""CLIP guided sampling from k-diffusion models.""" + +import argparse +import math + +import accelerate +import clip +from kornia import augmentation as KA +from resize_right import resize +import torch +from torch.nn import functional as F +from torchvision import transforms +from tqdm import trange, tqdm + +import k_diffusion as K + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def make_cond_model_fn(model, cond_fn): + def model_fn(x, sigma, **kwargs): + with torch.enable_grad(): + x = x.detach().requires_grad_() + denoised = model(x, sigma, **kwargs) + cond_grad = cond_fn(x, sigma, denoised=denoised, **kwargs).detach() + cond_denoised = denoised.detach() + cond_grad * K.utils.append_dims(sigma**2, x.ndim) + return cond_denoised + return model_fn + + +def make_static_thresh_model_fn(model, value=1.): + def model_fn(x, sigma, **kwargs): + return model(x, sigma, **kwargs).clamp(-value, value) + return model_fn + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + p.add_argument('prompt', type=str, + default='the prompt to use') + p.add_argument('--batch-size', type=int, default=16, + help='the batch size') + p.add_argument('--checkpoint', type=str, required=True, + help='the checkpoint to use') + p.add_argument('--clip-guidance-scale', '-cgs', type=float, default=500., + help='the CLIP guidance scale') + p.add_argument('--clip-model', type=str, default='ViT-B/16', choices=clip.available_models(), + help='the CLIP model to use') + p.add_argument('--config', type=str, required=True, + help='the model config') + p.add_argument('-n', type=int, default=64, + help='the number of images to sample') + p.add_argument('--prefix', type=str, default='out', + help='the output prefix') + p.add_argument('--steps', type=int, default=100, + help='the number of denoising steps') + args = p.parse_args() + + config = K.config.load_config(open(args.config)) + model_config = config['model'] + # TODO: allow non-square input sizes + assert len(model_config['input_size']) == 2 and model_config['input_size'][0] == model_config['input_size'][1] + size = model_config['input_size'] + + accelerator = accelerate.Accelerator() + device = accelerator.device + print('Using device:', device, flush=True) + + inner_model = K.config.make_model(config).eval().requires_grad_(False).to(device) + inner_model.load_state_dict(torch.load(args.checkpoint, map_location='cpu')['model_ema']) + accelerator.print('Parameters:', K.utils.n_params(inner_model)) + model = K.Denoiser(inner_model, sigma_data=model_config['sigma_data']) + + sigma_min = model_config['sigma_min'] + sigma_max = model_config['sigma_max'] + + clip_model = clip.load(args.clip_model, 
device=device)[0].eval().requires_grad_(False) + clip_normalize = transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)) + clip_size = (clip_model.visual.input_resolution, clip_model.visual.input_resolution) + aug = KA.RandomAffine(0, (1/14, 1/14), p=1, padding_mode='border') + + def get_image_embed(x): + if x.shape[2:4] != clip_size: + x = resize(x, out_shape=clip_size, pad_mode='reflect') + x = clip_normalize(x) + x = clip_model.encode_image(x).float() + return F.normalize(x) + + target_embed = F.normalize(clip_model.encode_text(clip.tokenize(args.prompt, truncate=True).to(device)).float()) + + def cond_fn(x, t, denoised): + image_embed = get_image_embed(aug(denoised.add(1).div(2))) + loss = spherical_dist_loss(image_embed, target_embed).sum() * args.clip_guidance_scale + grad = -torch.autograd.grad(loss, x)[0] + return grad + + model_fn = make_cond_model_fn(model, cond_fn) + model_fn = make_static_thresh_model_fn(model_fn) + + @torch.no_grad() + @K.utils.eval_mode(model) + def run(): + if accelerator.is_local_main_process: + tqdm.write('Sampling...') + sigmas = K.sampling.get_sigmas_karras(args.steps, sigma_min, sigma_max, rho=7., device=device) + def sample_fn(n): + x = torch.randn([n, model_config['input_channels'], size[0], size[1]], device=device) * sigmas[0] + x_0 = K.sampling.sample_dpmpp_2s_ancestral(model_fn, x, sigmas, eta=1., disable=not accelerator.is_local_main_process) + return x_0 + x_0 = K.evaluation.compute_features(accelerator, sample_fn, lambda x: x, args.n, args.batch_size) + if accelerator.is_main_process: + for i, out in enumerate(x_0): + filename = f'{args.prefix}_{i:05}.png' + K.utils.to_pil_image(out).save(filename) + + try: + run() + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + main() diff --git a/repositories/k-diffusion/setup.cfg b/repositories/k-diffusion/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..6678e78268ce97bf7d02c757615df514ab9e2dc0 --- /dev/null +++ b/repositories/k-diffusion/setup.cfg @@ -0,0 +1,30 @@ +[metadata] +name = k-diffusion +version = 0.0.15 +author = Katherine Crowson +author_email = crowsonkb@gmail.com +url = https://github.com/crowsonkb/k-diffusion +description = Karras et al. (2022) diffusion models for PyTorch +long_description = file: README.md +long_description_content_type = text/markdown +license = MIT + +[options] +packages = find: +install_requires = + accelerate + clean-fid + clip-anytorch + einops + jsonmerge + kornia + Pillow + resize-right + scikit-image + scipy + torch + torchdiffeq + torchsde + torchvision + tqdm + wandb diff --git a/repositories/k-diffusion/setup.py b/repositories/k-diffusion/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..0ae4555937eb30e6632281a2326726826a41fe88 --- /dev/null +++ b/repositories/k-diffusion/setup.py @@ -0,0 +1,5 @@ +from setuptools import setup + + +if __name__ == '__main__': + setup() diff --git a/repositories/k-diffusion/train.py b/repositories/k-diffusion/train.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba614c6a8edf66755384cc95ddcb2475832234c --- /dev/null +++ b/repositories/k-diffusion/train.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python3 + +"""Trains Karras et al. 
(2022) diffusion models.""" + +import argparse +from copy import deepcopy +from functools import partial +import math +import json +from pathlib import Path + +import accelerate +import torch +from torch import nn, optim +from torch import multiprocessing as mp +from torch.utils import data +from torchvision import datasets, transforms, utils +from tqdm.auto import trange, tqdm + +import k_diffusion as K + + +def main(): + p = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + p.add_argument('--batch-size', type=int, default=64, + help='the batch size') + p.add_argument('--config', type=str, required=True, + help='the configuration file') + p.add_argument('--demo-every', type=int, default=500, + help='save a demo grid every this many steps') + p.add_argument('--evaluate-every', type=int, default=10000, + help='save a demo grid every this many steps') + p.add_argument('--evaluate-n', type=int, default=2000, + help='the number of samples to draw to evaluate') + p.add_argument('--gns', action='store_true', + help='measure the gradient noise scale (DDP only)') + p.add_argument('--grad-accum-steps', type=int, default=1, + help='the number of gradient accumulation steps') + p.add_argument('--grow', type=str, + help='the checkpoint to grow from') + p.add_argument('--grow-config', type=str, + help='the configuration file of the model to grow from') + p.add_argument('--lr', type=float, + help='the learning rate') + p.add_argument('--name', type=str, default='model', + help='the name of the run') + p.add_argument('--num-workers', type=int, default=8, + help='the number of data loader workers') + p.add_argument('--resume', type=str, + help='the checkpoint to resume from') + p.add_argument('--sample-n', type=int, default=64, + help='the number of images to sample for demo grids') + p.add_argument('--save-every', type=int, default=10000, + help='save every this many steps') + p.add_argument('--seed', type=int, + help='the random seed') + p.add_argument('--start-method', type=str, default='spawn', + choices=['fork', 'forkserver', 'spawn'], + help='the multiprocessing start method') + p.add_argument('--wandb-entity', type=str, + help='the wandb entity name') + p.add_argument('--wandb-group', type=str, + help='the wandb group name') + p.add_argument('--wandb-project', type=str, + help='the wandb project name (specify this to enable wandb)') + p.add_argument('--wandb-save-model', action='store_true', + help='save model to wandb') + args = p.parse_args() + + mp.set_start_method(args.start_method) + torch.backends.cuda.matmul.allow_tf32 = True + + config = K.config.load_config(open(args.config)) + model_config = config['model'] + dataset_config = config['dataset'] + opt_config = config['optimizer'] + sched_config = config['lr_sched'] + ema_sched_config = config['ema_sched'] + + # TODO: allow non-square input sizes + assert len(model_config['input_size']) == 2 and model_config['input_size'][0] == model_config['input_size'][1] + size = model_config['input_size'] + + ddp_kwargs = accelerate.DistributedDataParallelKwargs(find_unused_parameters=model_config['skip_stages'] > 0) + accelerator = accelerate.Accelerator(kwargs_handlers=[ddp_kwargs], gradient_accumulation_steps=args.grad_accum_steps) + device = accelerator.device + print(f'Process {accelerator.process_index} using device: {device}', flush=True) + + if args.seed is not None: + seeds = torch.randint(-2 ** 63, 2 ** 63 - 1, [accelerator.num_processes], generator=torch.Generator().manual_seed(args.seed)) 
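+ # Derive a distinct seed for each process from --seed, so runs stay reproducible without every rank drawing identical noise.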
+ torch.manual_seed(seeds[accelerator.process_index]) + + inner_model = K.config.make_model(config) + inner_model_ema = deepcopy(inner_model) + if accelerator.is_main_process: + print('Parameters:', K.utils.n_params(inner_model)) + + # If logging to wandb, initialize the run + use_wandb = accelerator.is_main_process and args.wandb_project + if use_wandb: + import wandb + log_config = vars(args) + log_config['config'] = config + log_config['parameters'] = K.utils.n_params(inner_model) + wandb.init(project=args.wandb_project, entity=args.wandb_entity, group=args.wandb_group, config=log_config, save_code=True) + + if opt_config['type'] == 'adamw': + opt = optim.AdamW(inner_model.parameters(), + lr=opt_config['lr'] if args.lr is None else args.lr, + betas=tuple(opt_config['betas']), + eps=opt_config['eps'], + weight_decay=opt_config['weight_decay']) + elif opt_config['type'] == 'sgd': + opt = optim.SGD(inner_model.parameters(), + lr=opt_config['lr'] if args.lr is None else args.lr, + momentum=opt_config.get('momentum', 0.), + nesterov=opt_config.get('nesterov', False), + weight_decay=opt_config.get('weight_decay', 0.)) + else: + raise ValueError('Invalid optimizer type') + + if sched_config['type'] == 'inverse': + sched = K.utils.InverseLR(opt, + inv_gamma=sched_config['inv_gamma'], + power=sched_config['power'], + warmup=sched_config['warmup']) + elif sched_config['type'] == 'exponential': + sched = K.utils.ExponentialLR(opt, + num_steps=sched_config['num_steps'], + decay=sched_config['decay'], + warmup=sched_config['warmup']) + elif sched_config['type'] == 'constant': + sched = optim.lr_scheduler.LambdaLR(opt, lambda _: 1.0) + else: + raise ValueError('Invalid schedule type') + + assert ema_sched_config['type'] == 'inverse' + ema_sched = K.utils.EMAWarmup(power=ema_sched_config['power'], + max_value=ema_sched_config['max_value']) + + tf = transforms.Compose([ + transforms.Resize(size[0], interpolation=transforms.InterpolationMode.LANCZOS), + transforms.CenterCrop(size[0]), + K.augmentation.KarrasAugmentationPipeline(model_config['augment_prob']), + ]) + + if dataset_config['type'] == 'imagefolder': + train_set = K.utils.FolderOfImages(dataset_config['location'], transform=tf) + elif dataset_config['type'] == 'cifar10': + train_set = datasets.CIFAR10(dataset_config['location'], train=True, download=True, transform=tf) + elif dataset_config['type'] == 'mnist': + train_set = datasets.MNIST(dataset_config['location'], train=True, download=True, transform=tf) + elif dataset_config['type'] == 'huggingface': + from datasets import load_dataset + train_set = load_dataset(dataset_config['location']) + train_set.set_transform(partial(K.utils.hf_datasets_augs_helper, transform=tf, image_key=dataset_config['image_key'])) + train_set = train_set['train'] + else: + raise ValueError('Invalid dataset type') + + if accelerator.is_main_process: + try: + print('Number of items in dataset:', len(train_set)) + except TypeError: + pass + + image_key = dataset_config.get('image_key', 0) + + train_dl = data.DataLoader(train_set, args.batch_size, shuffle=True, drop_last=True, + num_workers=args.num_workers, persistent_workers=True) + + if args.grow: + if not args.grow_config: + raise ValueError('--grow requires --grow-config') + ckpt = torch.load(args.grow, map_location='cpu') + old_config = K.config.load_config(open(args.grow_config)) + old_inner_model = K.config.make_model(old_config) + old_inner_model.load_state_dict(ckpt['model_ema']) + if old_config['model']['skip_stages'] != model_config['skip_stages']: + 
old_inner_model.set_skip_stages(model_config['skip_stages']) + if old_config['model']['patch_size'] != model_config['patch_size']: + old_inner_model.set_patch_size(model_config['patch_size']) + inner_model.load_state_dict(old_inner_model.state_dict()) + del ckpt, old_inner_model + + inner_model, inner_model_ema, opt, train_dl = accelerator.prepare(inner_model, inner_model_ema, opt, train_dl) + if use_wandb: + wandb.watch(inner_model) + if args.gns: + gns_stats_hook = K.gns.DDPGradientStatsHook(inner_model) + gns_stats = K.gns.GradientNoiseScale() + else: + gns_stats = None + sigma_min = model_config['sigma_min'] + sigma_max = model_config['sigma_max'] + sample_density = K.config.make_sample_density(model_config) + + model = K.config.make_denoiser_wrapper(config)(inner_model) + model_ema = K.config.make_denoiser_wrapper(config)(inner_model_ema) + + state_path = Path(f'{args.name}_state.json') + + if state_path.exists() or args.resume: + if args.resume: + ckpt_path = args.resume + if not args.resume: + state = json.load(open(state_path)) + ckpt_path = state['latest_checkpoint'] + if accelerator.is_main_process: + print(f'Resuming from {ckpt_path}...') + ckpt = torch.load(ckpt_path, map_location='cpu') + accelerator.unwrap_model(model.inner_model).load_state_dict(ckpt['model']) + accelerator.unwrap_model(model_ema.inner_model).load_state_dict(ckpt['model_ema']) + opt.load_state_dict(ckpt['opt']) + sched.load_state_dict(ckpt['sched']) + ema_sched.load_state_dict(ckpt['ema_sched']) + epoch = ckpt['epoch'] + 1 + step = ckpt['step'] + 1 + if args.gns and ckpt.get('gns_stats', None) is not None: + gns_stats.load_state_dict(ckpt['gns_stats']) + + del ckpt + else: + epoch = 0 + step = 0 + + evaluate_enabled = args.evaluate_every > 0 and args.evaluate_n > 0 + if evaluate_enabled: + extractor = K.evaluation.InceptionV3FeatureExtractor(device=device) + train_iter = iter(train_dl) + if accelerator.is_main_process: + print('Computing features for reals...') + reals_features = K.evaluation.compute_features(accelerator, lambda x: next(train_iter)[image_key][1], extractor, args.evaluate_n, args.batch_size) + if accelerator.is_main_process: + metrics_log = K.utils.CSVLogger(f'{args.name}_metrics.csv', ['step', 'fid', 'kid']) + del train_iter + + @torch.no_grad() + @K.utils.eval_mode(model_ema) + def demo(): + if accelerator.is_main_process: + tqdm.write('Sampling...') + filename = f'{args.name}_demo_{step:08}.png' + n_per_proc = math.ceil(args.sample_n / accelerator.num_processes) + x = torch.randn([n_per_proc, model_config['input_channels'], size[0], size[1]], device=device) * sigma_max + sigmas = K.sampling.get_sigmas_karras(50, sigma_min, sigma_max, rho=7., device=device) + x_0 = K.sampling.sample_dpmpp_2m(model_ema, x, sigmas, disable=not accelerator.is_main_process) + x_0 = accelerator.gather(x_0)[:args.sample_n] + if accelerator.is_main_process: + grid = utils.make_grid(x_0, nrow=math.ceil(args.sample_n ** 0.5), padding=0) + K.utils.to_pil_image(grid).save(filename) + if use_wandb: + wandb.log({'demo_grid': wandb.Image(filename)}, step=step) + + @torch.no_grad() + @K.utils.eval_mode(model_ema) + def evaluate(): + if not evaluate_enabled: + return + if accelerator.is_main_process: + tqdm.write('Evaluating...') + sigmas = K.sampling.get_sigmas_karras(50, sigma_min, sigma_max, rho=7., device=device) + def sample_fn(n): + x = torch.randn([n, model_config['input_channels'], size[0], size[1]], device=device) * sigma_max + x_0 = K.sampling.sample_dpmpp_2m(model_ema, x, sigmas, disable=True) + return x_0 + 
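# Features of the generated samples are computed with the same InceptionV3 extractor used for the reals, then compared below to report FID and KID.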
fakes_features = K.evaluation.compute_features(accelerator, sample_fn, extractor, args.evaluate_n, args.batch_size) + if accelerator.is_main_process: + fid = K.evaluation.fid(fakes_features, reals_features) + kid = K.evaluation.kid(fakes_features, reals_features) + print(f'FID: {fid.item():g}, KID: {kid.item():g}') + if accelerator.is_main_process: + metrics_log.write(step, fid.item(), kid.item()) + if use_wandb: + wandb.log({'FID': fid.item(), 'KID': kid.item()}, step=step) + + def save(): + accelerator.wait_for_everyone() + filename = f'{args.name}_{step:08}.pth' + if accelerator.is_main_process: + tqdm.write(f'Saving to {filename}...') + obj = { + 'model': accelerator.unwrap_model(model.inner_model).state_dict(), + 'model_ema': accelerator.unwrap_model(model_ema.inner_model).state_dict(), + 'opt': opt.state_dict(), + 'sched': sched.state_dict(), + 'ema_sched': ema_sched.state_dict(), + 'epoch': epoch, + 'step': step, + 'gns_stats': gns_stats.state_dict() if gns_stats is not None else None, + } + accelerator.save(obj, filename) + if accelerator.is_main_process: + state_obj = {'latest_checkpoint': filename} + json.dump(state_obj, open(state_path, 'w')) + if args.wandb_save_model and use_wandb: + wandb.save(filename) + + try: + while True: + for batch in tqdm(train_dl, disable=not accelerator.is_main_process): + with accelerator.accumulate(model): + reals, _, aug_cond = batch[image_key] + noise = torch.randn_like(reals) + sigma = sample_density([reals.shape[0]], device=device) + losses = model.loss(reals, noise, sigma, aug_cond=aug_cond) + losses_all = accelerator.gather(losses) + loss = losses_all.mean() + accelerator.backward(losses.mean()) + if args.gns: + sq_norm_small_batch, sq_norm_large_batch = gns_stats_hook.get_stats() + gns_stats.update(sq_norm_small_batch, sq_norm_large_batch, reals.shape[0], reals.shape[0] * accelerator.num_processes) + opt.step() + sched.step() + opt.zero_grad() + if accelerator.sync_gradients: + ema_decay = ema_sched.get_value() + K.utils.ema_update(model, model_ema, ema_decay) + ema_sched.step() + + if accelerator.is_main_process: + if step % 25 == 0: + if args.gns: + tqdm.write(f'Epoch: {epoch}, step: {step}, loss: {loss.item():g}, gns: {gns_stats.get_gns():g}') + else: + tqdm.write(f'Epoch: {epoch}, step: {step}, loss: {loss.item():g}') + + if use_wandb: + log_dict = { + 'epoch': epoch, + 'loss': loss.item(), + 'lr': sched.get_last_lr()[0], + 'ema_decay': ema_decay, + } + if args.gns: + log_dict['gradient_noise_scale'] = gns_stats.get_gns() + wandb.log(log_dict, step=step) + + if step % args.demo_every == 0: + demo() + + if evaluate_enabled and step > 0 and step % args.evaluate_every == 0: + evaluate() + + if step > 0 and step % args.save_every == 0: + save() + + step += 1 + epoch += 1 + except KeyboardInterrupt: + pass + + +if __name__ == '__main__': + main() diff --git a/repositories/stable-diffusion-stability-ai/.gitignore b/repositories/stable-diffusion-stability-ai/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..3b33f062894825a067995b8b9e8224ba4d9ff708 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/.gitignore @@ -0,0 +1,165 @@ +# Generated by project +outputs/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# General MacOS +.DS_Store +.AppleDouble +.LSOverride + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ 
+.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# IDEs +.idea/ +.vscode/ diff --git a/repositories/stable-diffusion-stability-ai/LICENSE b/repositories/stable-diffusion-stability-ai/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..58a49c99b2b9151af5e1fee0dbd20307671f47ab --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Stability AI + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/repositories/stable-diffusion-stability-ai/LICENSE-MODEL b/repositories/stable-diffusion-stability-ai/LICENSE-MODEL new file mode 100644 index 0000000000000000000000000000000000000000..9684533d88e7d853a55cabf6caa2f1e4a3e6fdc4 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/LICENSE-MODEL @@ -0,0 +1,84 @@ +Copyright (c) 2022 Stability AI and contributors + +CreativeML Open RAIL++-M License +dated November 24, 2022 + +Section I: PREAMBLE + +Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation. + +Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations. + +In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation. + +Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI. + +This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model. + +NOW THEREFORE, You and Licensor agree as follows: + +1. Definitions + +- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document. +- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License. +- "Output" means the results of operating a Model as embodied in informational content resulting therefrom. +- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material. 
+- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model. +- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any. +- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access. +- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model. +- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator. +- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You. +- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." +- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model. + +Section II: INTELLECTUAL PROPERTY RIGHTS + +Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model. +3. Grant of Patent License. 
Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed. + +Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION + +4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions: +Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material. +You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License; +You must cause any modified files to carry prominent notices stating that You changed the files; +You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model. +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License. +5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5). +6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License. 
+ +Section IV: OTHER PROVISIONS + +7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License. +8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors. +9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License. +10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. +11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. +12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein. 
+ +END OF TERMS AND CONDITIONS + + + + +Attachment A + +Use Restrictions + +You agree not to use the Model or Derivatives of the Model: + +- In any way that violates any applicable national, federal, state, local or international law or regulation; +- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way; +- To generate or disseminate verifiably false information and/or content with the purpose of harming others; +- To generate or disseminate personal identifiable information that can be used to harm an individual; +- To defame, disparage or otherwise harass others; +- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation; +- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics; +- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm; +- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories; +- To provide medical advice and medical results interpretation; +- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use). + diff --git a/repositories/stable-diffusion-stability-ai/README.md b/repositories/stable-diffusion-stability-ai/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2dfddca33cf71da343a937cf0fb5e32150f16fe3 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/README.md @@ -0,0 +1,302 @@ +# Stable Diffusion Version 2 +![t2i](assets/stable-samples/txt2img/768/merged-0006.png) +![t2i](assets/stable-samples/txt2img/768/merged-0002.png) +![t2i](assets/stable-samples/txt2img/768/merged-0005.png) + +This repository contains [Stable Diffusion](https://github.com/CompVis/stable-diffusion) models trained from scratch and will be continuously updated with +new checkpoints. The following list provides an overview of all currently available models. More coming soon. + +## News + + +**March 24, 2023** + +*Stable UnCLIP 2.1* + +- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. 
Instructions are available [here](doc/UNCLIP.MD). + +- A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine) + + +**December 7, 2022** + +*Version 2.1* + +- New stable diffusion model (_Stable Diffusion 2.1-v_, [Hugging Face](https://huggingface.co/stabilityai/stable-diffusion-2-1)) at 768x768 resolution and (_Stable Diffusion 2.1-base_, [HuggingFace](https://huggingface.co/stabilityai/stable-diffusion-2-1-base)) at 512x512 resolution, both based on the same number of parameters and architecture as 2.0 and fine-tuned on 2.0, on a less restrictive NSFW filtering of the [LAION-5B](https://laion.ai/blog/laion-5b/) dataset. +Per default, the attention operation of the model is evaluated at full precision when `xformers` is not installed. To enable fp16 (which can cause numerical instabilities with the vanilla attention module on the v2.1 model) , run your script with `ATTN_PRECISION=fp16 python ` + +**November 24, 2022** + +*Version 2.0* + +- New stable diffusion model (_Stable Diffusion 2.0-v_) at 768x768 resolution. Same number of parameters in the U-Net as 1.5, but uses [OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip) as the text encoder and is trained from scratch. _SD 2.0-v_ is a so-called [v-prediction](https://arxiv.org/abs/2202.00512) model. +- The above model is finetuned from _SD 2.0-base_, which was trained as a standard noise-prediction model on 512x512 images and is also made available. +- Added a [x4 upscaling latent text-guided diffusion model](#image-upscaling-with-stable-diffusion). +- New [depth-guided stable diffusion model](#depth-conditional-stable-diffusion), finetuned from _SD 2.0-base_. The model is conditioned on monocular depth estimates inferred via [MiDaS](https://github.com/isl-org/MiDaS) and can be used for structure-preserving img2img and shape-conditional synthesis. + + ![d2i](assets/stable-samples/depth2img/depth2img01.png) +- A [text-guided inpainting model](#image-inpainting-with-stable-diffusion), finetuned from SD _2.0-base_. + +We follow the [original repository](https://github.com/CompVis/stable-diffusion) and provide basic inference scripts to sample from the models. + +________________ +*The original Stable Diffusion model was created in a collaboration with [CompVis](https://arxiv.org/abs/2202.00512) and [RunwayML](https://runwayml.com/) and builds upon the work:* + +[**High-Resolution Image Synthesis with Latent Diffusion Models**](https://ommer-lab.com/research/latent-diffusion-models/)
+[Robin Rombach](https://github.com/rromb)\*,
+[Andreas Blattmann](https://github.com/ablattmann)\*,
+[Dominik Lorenz](https://github.com/qp-qp),
+[Patrick Esser](https://github.com/pesser),
+[Björn Ommer](https://hci.iwr.uni-heidelberg.de/Staff/bommer)
+_[CVPR '22 Oral](https://openaccess.thecvf.com/content/CVPR2022/html/Rombach_High-Resolution_Image_Synthesis_With_Latent_Diffusion_Models_CVPR_2022_paper.html) |
+[GitHub](https://github.com/CompVis/latent-diffusion) | [arXiv](https://arxiv.org/abs/2112.10752) | [Project page](https://ommer-lab.com/research/latent-diffusion-models/)_
+
+and [many others](#shout-outs).
+
+Stable Diffusion is a latent text-to-image diffusion model.
+________________________________
+
+## Requirements
+
+You can update an existing [latent diffusion](https://github.com/CompVis/latent-diffusion) environment by running
+
+```
+conda install pytorch==1.12.1 torchvision==0.13.1 -c pytorch
+pip install transformers==4.19.2 diffusers invisible-watermark
+pip install -e .
+```
+#### xformers efficient attention
+For more efficiency and speed on GPUs,
+we highly recommend installing the [xformers](https://github.com/facebookresearch/xformers)
+library.
+
+Tested on A100 with CUDA 11.4.
+Installation needs a somewhat recent version of nvcc and gcc/g++; obtain those, e.g., via
+```commandline
+export CUDA_HOME=/usr/local/cuda-11.4
+conda install -c nvidia/label/cuda-11.4.0 cuda-nvcc
+conda install -c conda-forge gcc
+conda install -c conda-forge gxx_linux-64==9.5.0
+```
+
+Then, run the following (compiling takes up to 30 min).
+
+```commandline
+cd ..
+git clone https://github.com/facebookresearch/xformers.git
+cd xformers
+git submodule update --init --recursive
+pip install -r requirements.txt
+pip install -e .
+cd ../stablediffusion
+```
+Upon successful installation, the code will automatically default to [memory efficient attention](https://github.com/facebookresearch/xformers)
+for the self- and cross-attention layers in the U-Net and autoencoder.
+
+## General Disclaimer
+Stable Diffusion models are general text-to-image diffusion models and therefore mirror biases and (mis-)conceptions that are present
+in their training data. Although efforts were made to reduce the inclusion of explicit pornographic material, **we do not recommend using the provided weights for services or products without additional safety mechanisms and considerations.
+The weights are research artifacts and should be treated as such.**
+Details on the training procedure and data, as well as the intended use of the model, can be found in the corresponding [model card](https://huggingface.co/stabilityai/stable-diffusion-2).
+The weights are available via [the StabilityAI organization at Hugging Face](https://huggingface.co/StabilityAI) under the [CreativeML Open RAIL++-M License](LICENSE-MODEL).
+
+
+
+## Stable Diffusion v2
+
+Stable Diffusion v2 refers to a specific configuration of the model
+architecture that uses a downsampling-factor 8 autoencoder with an 865M UNet
+and OpenCLIP ViT-H/14 text encoder for the diffusion model. The _SD 2-v_ model produces 768x768 px outputs.
+
+Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
+5.0, 6.0, 7.0, 8.0) and 50 DDIM sampling steps show the relative improvements of the checkpoints:
+
+![sd evaluation results](assets/model-variants.jpg)
+
+
+
+### Text-to-Image
+![txt2img-stable2](assets/stable-samples/txt2img/merged-0003.png)
+![txt2img-stable2](assets/stable-samples/txt2img/merged-0001.png)
+
+Stable Diffusion 2 is a latent diffusion model conditioned on the penultimate text embeddings of a CLIP ViT-H/14 text encoder.
+We provide a [reference script for sampling](#reference-sampling-script).
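+
+As an alternative to the reference sampling script described in the next section, the checkpoints can also be sampled through the `diffusers` library installed in the Requirements step above. The following is a minimal sketch, not one of this repository's scripts; the model id `stabilityai/stable-diffusion-2-1`, the fp16 dtype, and the CUDA device are assumptions to adapt to your setup.
+
+```python
+# Hypothetical minimal text-to-image example via diffusers (not the reference
+# script below); assumes a CUDA GPU and the stabilityai/stable-diffusion-2-1
+# checkpoint hosted on the Hugging Face Hub.
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+
+prompt = "a professional photograph of an astronaut riding a horse"
+# 768x768 matches the resolution the SD 2.1-v checkpoint was trained on.
+image = pipe(prompt, height=768, width=768, num_inference_steps=50).images[0]
+image.save("astronaut_rides_horse.png")
+```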
+#### Reference Sampling Script + +This script incorporates an [invisible watermarking](https://github.com/ShieldMnt/invisible-watermark) of the outputs, to help viewers [identify the images as machine-generated](scripts/tests/test_watermark.py). +We provide the configs for the _SD2-v_ (768px) and _SD2-base_ (512px) model. + +First, download the weights for [_SD2.1-v_](https://huggingface.co/stabilityai/stable-diffusion-2-1) and [_SD2.1-base_](https://huggingface.co/stabilityai/stable-diffusion-2-1-base). + +To sample from the _SD2.1-v_ model, run the following: + +``` +python scripts/txt2img.py --prompt "a professional photograph of an astronaut riding a horse" --ckpt --config configs/stable-diffusion/v2-inference-v.yaml --H 768 --W 768 +``` +or try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/stabilityai/stable-diffusion). + +To sample from the base model, use +``` +python scripts/txt2img.py --prompt "a professional photograph of an astronaut riding a horse" --ckpt --config +``` + +By default, this uses the [DDIM sampler](https://arxiv.org/abs/2010.02502), and renders images of size 768x768 (which it was trained on) in 50 steps. +Empirically, the v-models can be sampled with higher guidance scales. + +Note: The inference config for all model versions is designed to be used with EMA-only checkpoints. +For this reason `use_ema=False` is set in the configuration, otherwise the code will try to switch from +non-EMA to EMA weights. + +#### Enable Intel® Extension for PyTorch* optimizations in Text-to-Image script + +If you're planning on running Text-to-Image on Intel® CPU, try to sample an image with TorchScript and Intel® Extension for PyTorch* optimizations. Intel® Extension for PyTorch* extends PyTorch by enabling up-to-date features optimizations for an extra performance boost on Intel® hardware. It can optimize memory layout of the operators to Channel Last memory format, which is generally beneficial for Intel CPUs, take advantage of the most advanced instruction set available on a machine, optimize operators and many more. + +**Prerequisites** + +Before running the script, make sure you have all needed libraries installed. (the optimization was checked on `Ubuntu 20.04`). Install [jemalloc](https://github.com/jemalloc/jemalloc), [numactl](https://linux.die.net/man/8/numactl), Intel® OpenMP and Intel® Extension for PyTorch*. + +```bash +apt-get install numactl libjemalloc-dev +pip install intel-openmp +pip install intel_extension_for_pytorch -f https://software.intel.com/ipex-whl-stable +``` + +To sample from the _SD2.1-v_ model with TorchScript+IPEX optimizations, run the following. Remember to specify desired number of instances you want to run the program on ([more](https://github.com/intel/intel-extension-for-pytorch/blob/master/intel_extension_for_pytorch/cpu/launch.py#L48)). 
+ +``` +MALLOC_CONF=oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000 python -m intel_extension_for_pytorch.cpu.launch --ninstance --enable_jemalloc scripts/txt2img.py --prompt \"a corgi is playing guitar, oil on canvas\" --ckpt --config configs/stable-diffusion/intel/v2-inference-v-fp32.yaml --H 768 --W 768 --precision full --device cpu --torchscript --ipex +``` + +To sample from the base model with IPEX optimizations, use + +``` +MALLOC_CONF=oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000 python -m intel_extension_for_pytorch.cpu.launch --ninstance --enable_jemalloc scripts/txt2img.py --prompt \"a corgi is playing guitar, oil on canvas\" --ckpt --config configs/stable-diffusion/intel/v2-inference-fp32.yaml --n_samples 1 --n_iter 4 --precision full --device cpu --torchscript --ipex +``` + +If you're using a CPU that supports `bfloat16`, consider sample from the model with bfloat16 enabled for a performance boost, like so + +```bash +# SD2.1-v +MALLOC_CONF=oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000 python -m intel_extension_for_pytorch.cpu.launch --ninstance --enable_jemalloc scripts/txt2img.py --prompt \"a corgi is playing guitar, oil on canvas\" --ckpt --config configs/stable-diffusion/intel/v2-inference-v-bf16.yaml --H 768 --W 768 --precision full --device cpu --torchscript --ipex --bf16 +# SD2.1-base +MALLOC_CONF=oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:9000000000,muzzy_decay_ms:9000000000 python -m intel_extension_for_pytorch.cpu.launch --ninstance --enable_jemalloc scripts/txt2img.py --prompt \"a corgi is playing guitar, oil on canvas\" --ckpt --config configs/stable-diffusion/intel/v2-inference-bf16.yaml --precision full --device cpu --torchscript --ipex --bf16 +``` + +### Image Modification with Stable Diffusion + +![depth2img-stable2](assets/stable-samples/depth2img/merged-0000.png) +#### Depth-Conditional Stable Diffusion + +To augment the well-established [img2img](https://github.com/CompVis/stable-diffusion#image-modification-with-stable-diffusion) functionality of Stable Diffusion, we provide a _shape-preserving_ stable diffusion model. + + +Note that the original method for image modification introduces significant semantic changes w.r.t. the initial image. +If that is not desired, download our [depth-conditional stable diffusion](https://huggingface.co/stabilityai/stable-diffusion-2-depth) model and the `dpt_hybrid` MiDaS [model weights](https://github.com/intel-isl/DPT/releases/download/1_0/dpt_hybrid-midas-501f0c75.pt), place the latter in a folder `midas_models` and sample via +``` +python scripts/gradio/depth2img.py configs/stable-diffusion/v2-midas-inference.yaml +``` + +or + +``` +streamlit run scripts/streamlit/depth2img.py configs/stable-diffusion/v2-midas-inference.yaml +``` + +This method can be used on the samples of the base model itself. +For example, take [this sample](assets/stable-samples/depth2img/old_man.png) generated by an anonymous discord user. +Using the [gradio](https://gradio.app) or [streamlit](https://streamlit.io/) script `depth2img.py`, the MiDaS model first infers a monocular depth estimate given this input, +and the diffusion model is then conditioned on the (relative) depth output. + +
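+If you want to inspect the conditioning signal on its own, the monocular depth estimate that `depth2img.py` feeds into the diffusion model can be computed with MiDaS directly. The snippet below is a rough sketch rather than one of the provided scripts; the `torch.hub` entry points come from the MiDaS repository, and the input/output filenames are placeholders.
+
+```python
+# Sketch: infer a relative monocular depth map with MiDaS (dpt_hybrid), i.e.
+# the quantity the depth-conditional model is conditioned on.
+# "input.png" and "depth.png" are placeholder filenames.
+import cv2
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+midas = torch.hub.load("intel-isl/MiDaS", "DPT_Hybrid").to(device).eval()
+transform = torch.hub.load("intel-isl/MiDaS", "transforms").dpt_transform
+
+img = cv2.cvtColor(cv2.imread("input.png"), cv2.COLOR_BGR2RGB)
+with torch.no_grad():
+    prediction = midas(transform(img).to(device))
+    # Resize the prediction back to the input resolution.
+    depth = torch.nn.functional.interpolate(
+        prediction.unsqueeze(1), size=img.shape[:2], mode="bicubic", align_corners=False
+    ).squeeze()
+
+# Normalize to 8-bit for visualization; the model itself is conditioned on the
+# relative depth values, not on this rendering.
+depth_vis = (depth - depth.min()) / (depth.max() - depth.min())
+cv2.imwrite("depth.png", (depth_vis * 255.0).cpu().numpy().astype("uint8"))
+```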

+![depth2image](assets/stable-samples/depth2img/d2i.gif)
+ +This model is particularly useful for a photorealistic style; see the [examples](assets/stable-samples/depth2img). +For a maximum strength of 1.0, the model removes all pixel-based information and only relies on the text prompt and the inferred monocular depth estimate. + +![depth2img-stable3](assets/stable-samples/depth2img/merged-0005.png) + +#### Classic Img2Img + +For running the "classic" img2img, use +``` +python scripts/img2img.py --prompt "A fantasy landscape, trending on artstation" --init-img --strength 0.8 --ckpt +``` +and adapt the checkpoint and config paths accordingly. + +### Image Upscaling with Stable Diffusion +![upscaling-x4](assets/stable-samples/upscaling/merged-dog.png) +After [downloading the weights](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler), run +``` +python scripts/gradio/superresolution.py configs/stable-diffusion/x4-upscaling.yaml +``` + +or + +``` +streamlit run scripts/streamlit/superresolution.py -- configs/stable-diffusion/x4-upscaling.yaml +``` + +for a Gradio or Streamlit demo of the text-guided x4 superresolution model. +This model can be used both on real inputs and on synthesized examples. For the latter, we recommend setting a higher +`noise_level`, e.g. `noise_level=100`. + +### Image Inpainting with Stable Diffusion + +![inpainting-stable2](assets/stable-inpainting/merged-leopards.png) + +[Download the SD 2.0-inpainting checkpoint](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) and run + +``` +python scripts/gradio/inpainting.py configs/stable-diffusion/v2-inpainting-inference.yaml +``` + +or + +``` +streamlit run scripts/streamlit/inpainting.py -- configs/stable-diffusion/v2-inpainting-inference.yaml +``` + +for a Gradio or Streamlit demo of the inpainting model. +This scripts adds invisible watermarking to the demo in the [RunwayML](https://github.com/runwayml/stable-diffusion/blob/main/scripts/inpaint_st.py) repository, but both should work interchangeably with the checkpoints/configs. + + + +## Shout-Outs +- Thanks to [Hugging Face](https://huggingface.co/) and in particular [Apolinário](https://github.com/apolinario) for support with our model releases! +- Stable Diffusion would not be possible without [LAION](https://laion.ai/) and their efforts to create open, large-scale datasets. +- The [DeepFloyd team](https://twitter.com/deepfloydai) at Stability AI, for creating the subset of [LAION-5B](https://laion.ai/blog/laion-5b/) dataset used to train the model. +- Stable Diffusion 2.0 uses [OpenCLIP](https://laion.ai/blog/large-openclip/), trained by [Romain Beaumont](https://github.com/rom1504). +- Our codebase for the diffusion models builds heavily on [OpenAI's ADM codebase](https://github.com/openai/guided-diffusion) +and [https://github.com/lucidrains/denoising-diffusion-pytorch](https://github.com/lucidrains/denoising-diffusion-pytorch). +Thanks for open-sourcing! +- [CompVis](https://github.com/CompVis/stable-diffusion) initial stable diffusion release +- [Patrick](https://github.com/pesser)'s [implementation](https://github.com/runwayml/stable-diffusion/blob/main/scripts/inpaint_st.py) of the streamlit demo for inpainting. +- `img2img` is an application of [SDEdit](https://arxiv.org/abs/2108.01073) by [Chenlin Meng](https://cs.stanford.edu/~chenlin/) from the [Stanford AI Lab](https://cs.stanford.edu/~ermon/website/). 
+- [Kat's implementation]((https://github.com/CompVis/latent-diffusion/pull/51)) of the [PLMS](https://arxiv.org/abs/2202.09778) sampler, and [more](https://github.com/crowsonkb/k-diffusion). +- [DPMSolver](https://arxiv.org/abs/2206.00927) [integration](https://github.com/CompVis/stable-diffusion/pull/440) by [Cheng Lu](https://github.com/LuChengTHU). +- Facebook's [xformers](https://github.com/facebookresearch/xformers) for efficient attention computation. +- [MiDaS](https://github.com/isl-org/MiDaS) for monocular depth estimation. + + +## License + +The code in this repository is released under the MIT License. + +The weights are available via [the StabilityAI organization at Hugging Face](https://huggingface.co/StabilityAI), and released under the [CreativeML Open RAIL++-M License](LICENSE-MODEL) License. + +## BibTeX + +``` +@misc{rombach2021highresolution, + title={High-Resolution Image Synthesis with Latent Diffusion Models}, + author={Robin Rombach and Andreas Blattmann and Dominik Lorenz and Patrick Esser and Björn Ommer}, + year={2021}, + eprint={2112.10752}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + + diff --git a/repositories/stable-diffusion-stability-ai/assets/model-variants.jpg b/repositories/stable-diffusion-stability-ai/assets/model-variants.jpg new file mode 100644 index 0000000000000000000000000000000000000000..de5bb3fc8af5d60d0c0fac8c0c866e9f8d857ba3 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/model-variants.jpg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/modelfigure.png b/repositories/stable-diffusion-stability-ai/assets/modelfigure.png new file mode 100644 index 0000000000000000000000000000000000000000..6b1d3e6b9d59fd8d38468e7bce47c903a4e1c932 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/modelfigure.png differ diff --git a/repositories/stable-diffusion-stability-ai/assets/rick.jpeg b/repositories/stable-diffusion-stability-ai/assets/rick.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..995486061ba50bd0ae2e213c72de87a27326632f Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/rick.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/inpainting.gif b/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/inpainting.gif new file mode 100644 index 0000000000000000000000000000000000000000..4f511347bc057f858db95015725488560ec1f676 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/inpainting.gif differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/merged-leopards.png b/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/merged-leopards.png new file mode 100644 index 0000000000000000000000000000000000000000..b877158fff2a15d4016135ce2307e19ece877b59 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-inpainting/merged-leopards.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a05d717a340d7b240283e72e91984e82093750ba066aa05ab0759188467e69 +size 4958974 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/d2i.gif b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/d2i.gif new file mode 100644 index 0000000000000000000000000000000000000000..265c1d1ae56feb60c5aabbf88526a6ed47fda383 --- /dev/null +++ 
b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/d2i.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7236fc7f4349740c537ef9c8730590c15d198aaf42925a46755ded26bc436bc4 +size 1140400 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2fantasy.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2fantasy.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..8c0c94ee7354cc19e4c86134900397c1ed13eb01 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2fantasy.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img01.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img01.png new file mode 100644 index 0000000000000000000000000000000000000000..f062b45a32bf71ce830b4ab2401475d421417d03 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img01.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60cb68c273602eae8e2fb769a2848e55844d812196260ada112a9aecc604f735 +size 3324111 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img02.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img02.png new file mode 100644 index 0000000000000000000000000000000000000000..f91cad5b401955221d8875529578504762ca6bfc --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/depth2img02.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adabff92594b17a2554408257d2ab61eb9b89270d5917eafd44a9b75740aab04 +size 1775470 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0000.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0000.png new file mode 100644 index 0000000000000000000000000000000000000000..fce6c7a0b76bd34a2028a20efa5e5a497953cc7e --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0000.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19da6052f01a3b115ac3315ef5db1b7dcdb58091879c0dfe3895a7765a491ac +size 2129264 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0004.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0004.png new file mode 100644 index 0000000000000000000000000000000000000000..41641d3e2d07575fb5fa0e87609770ab1bae8126 --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0004.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d3ae25218f696375aa953e55d12d034da3bd7abce68616a3af916bdae01cc86 +size 1448945 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png new file mode 100644 index 0000000000000000000000000000000000000000..46634965fdf77e2134e2ac574fe02c12cd34d96e --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/merged-0005.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935252e202a3a6cafa476443f3a0ae3ac95cf85c37c0133f4f32af2aafb8f9ab +size 4546641 diff --git 
a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/midas.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/midas.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4890ab7c6ca378f27dcf1eef6012848798713c94 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/midas.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/old_man.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/old_man.png new file mode 100644 index 0000000000000000000000000000000000000000..44bbc0051b5f12743b1c7ef92f23ed5758eed4e2 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/depth2img/old_man.png differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-1.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-1.png new file mode 100644 index 0000000000000000000000000000000000000000..d01b8350743e3bd4fdf653d1563ee7d5c2153323 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-1.png differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-2.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-2.png new file mode 100644 index 0000000000000000000000000000000000000000..e9f4e708535f0e5b53372a3a39e6aa31dce383fd Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-2.png differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-3.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-3.png new file mode 100644 index 0000000000000000000000000000000000000000..017de3012c2f03e4f87cce21b4d3342713b9ae95 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/mountains-3.png differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/sketch-mountains-input.jpg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/sketch-mountains-input.jpg new file mode 100644 index 0000000000000000000000000000000000000000..79d652b8003bbcd1d0c0ba2d984dbbe299ac5916 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/sketch-mountains-input.jpg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png new file mode 100644 index 0000000000000000000000000000000000000000..6a16bf53a95850a4eb7730105cea61af7c435c5e --- /dev/null +++ b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-in.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e043b62bdfcc5be7d0eca5c747878b78e4e6ffaeb3cd1257568cbc2b5e6f7a +size 1167237 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png new file mode 100644 index 0000000000000000000000000000000000000000..b7926bc81099736f7c8df32cadc4481c07eddbd6 --- /dev/null +++ 
b/repositories/stable-diffusion-stability-ai/assets/stable-samples/img2img/upscaling-out.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c171218814d141f87884672cb00ae07c3ed0e14ce7f7023f2041678e01d93f59 +size 1317941 diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/houses_out.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/houses_out.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b4b8fb9e9cb3087d02b4ebac9591dbc9d44fcd50 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/houses_out.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar000.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar000.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..d61c85e0d832a2fa9592d375bf5a1ca1d9bcab09 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar000.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar500.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar500.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..175d2825101fd64f5a11456c2f86e15f1b281a83 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar500.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar800.jpeg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar800.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..d4d91130c0fb4a97389b27cdc8824c0df4d9c485 Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/oldcar800.jpeg differ diff --git a/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/panda.jpg b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/panda.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49aa1ba4d5a818df96988f3dc2ade49200c767ab Binary files /dev/null and b/repositories/stable-diffusion-stability-ai/assets/stable-samples/stable-unclip/panda.jpg differ