diffuse-custom

Runtime error

App Files Files Community

diffuse-custom / diffusers /models /vae.py

Jackflack09

Duplicate from YeOldHermit/Super-Resolution-Anime-Diffusion

522606a over 1 year ago

raw

history blame

23.6 kB

	# Copyright 2022 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	from dataclasses import dataclass
	from typing import Optional, Tuple, Union

	import numpy as np
	import torch
	import torch.nn as nn

	from ..configuration_utils import ConfigMixin, register_to_config
	from ..modeling_utils import ModelMixin
	from ..utils import BaseOutput
	from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block


	@dataclass
	class DecoderOutput(BaseOutput):
	"""
	Output of decoding method.

	Args:
	sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	Decoded output sample of the model. Output of the last layer of the model.
	"""

	sample: torch.FloatTensor


	@dataclass
	class VQEncoderOutput(BaseOutput):
	"""
	Output of VQModel encoding method.

	Args:
	latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
	Encoded output sample of the model. Output of the last layer of the model.
	"""

	latents: torch.FloatTensor


	@dataclass
	class AutoencoderKLOutput(BaseOutput):
	"""
	Output of AutoencoderKL encoding method.

	Args:
	latent_dist (`DiagonalGaussianDistribution`):
	Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
	`DiagonalGaussianDistribution` allows for sampling latents from the distribution.
	"""

	latent_dist: "DiagonalGaussianDistribution"


	class Encoder(nn.Module):
	def __init__(
	self,
	in_channels=3,
	out_channels=3,
	down_block_types=("DownEncoderBlock2D",),
	block_out_channels=(64,),
	layers_per_block=2,
	norm_num_groups=32,
	act_fn="silu",
	double_z=True,
	):
	super().__init__()
	self.layers_per_block = layers_per_block

	self.conv_in = torch.nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1)

	self.mid_block = None
	self.down_blocks = nn.ModuleList([])

	# down
	output_channel = block_out_channels[0]
	for i, down_block_type in enumerate(down_block_types):
	input_channel = output_channel
	output_channel = block_out_channels[i]
	is_final_block = i == len(block_out_channels) - 1

	down_block = get_down_block(
	down_block_type,
	num_layers=self.layers_per_block,
	in_channels=input_channel,
	out_channels=output_channel,
	add_downsample=not is_final_block,
	resnet_eps=1e-6,
	downsample_padding=0,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	attn_num_head_channels=None,
	temb_channels=None,
	)
	self.down_blocks.append(down_block)

	# mid
	self.mid_block = UNetMidBlock2D(
	in_channels=block_out_channels[-1],
	resnet_eps=1e-6,
	resnet_act_fn=act_fn,
	output_scale_factor=1,
	resnet_time_scale_shift="default",
	attn_num_head_channels=None,
	resnet_groups=norm_num_groups,
	temb_channels=None,
	)

	# out
	self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
	self.conv_act = nn.SiLU()

	conv_out_channels = 2 * out_channels if double_z else out_channels
	self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1)

	def forward(self, x):
	sample = x
	sample = self.conv_in(sample)

	# down
	for down_block in self.down_blocks:
	sample = down_block(sample)

	# middle
	sample = self.mid_block(sample)

	# post-process
	sample = self.conv_norm_out(sample)
	sample = self.conv_act(sample)
	sample = self.conv_out(sample)

	return sample


	class Decoder(nn.Module):
	def __init__(
	self,
	in_channels=3,
	out_channels=3,
	up_block_types=("UpDecoderBlock2D",),
	block_out_channels=(64,),
	layers_per_block=2,
	norm_num_groups=32,
	act_fn="silu",
	):
	super().__init__()
	self.layers_per_block = layers_per_block

	self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)

	self.mid_block = None
	self.up_blocks = nn.ModuleList([])

	# mid
	self.mid_block = UNetMidBlock2D(
	in_channels=block_out_channels[-1],
	resnet_eps=1e-6,
	resnet_act_fn=act_fn,
	output_scale_factor=1,
	resnet_time_scale_shift="default",
	attn_num_head_channels=None,
	resnet_groups=norm_num_groups,
	temb_channels=None,
	)

	# up
	reversed_block_out_channels = list(reversed(block_out_channels))
	output_channel = reversed_block_out_channels[0]
	for i, up_block_type in enumerate(up_block_types):
	prev_output_channel = output_channel
	output_channel = reversed_block_out_channels[i]

	is_final_block = i == len(block_out_channels) - 1

	up_block = get_up_block(
	up_block_type,
	num_layers=self.layers_per_block + 1,
	in_channels=prev_output_channel,
	out_channels=output_channel,
	prev_output_channel=None,
	add_upsample=not is_final_block,
	resnet_eps=1e-6,
	resnet_act_fn=act_fn,
	resnet_groups=norm_num_groups,
	attn_num_head_channels=None,
	temb_channels=None,
	)
	self.up_blocks.append(up_block)
	prev_output_channel = output_channel

	# out
	self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
	self.conv_act = nn.SiLU()
	self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)

	def forward(self, z):
	sample = z
	sample = self.conv_in(sample)

	# middle
	sample = self.mid_block(sample)

	# up
	for up_block in self.up_blocks:
	sample = up_block(sample)

	# post-process
	sample = self.conv_norm_out(sample)
	sample = self.conv_act(sample)
	sample = self.conv_out(sample)

	return sample


	class VectorQuantizer(nn.Module):
	"""
	Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
	multiplications and allows for post-hoc remapping of indices.
	"""

	# NOTE: due to a bug the beta term was applied to the wrong term. for
	# backwards compatibility we use the buggy version by default, but you can
	# specify legacy=False to fix it.
	def __init__(
	self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True
	):
	super().__init__()
	self.n_e = n_e
	self.vq_embed_dim = vq_embed_dim
	self.beta = beta
	self.legacy = legacy

	self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim)
	self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)

	self.remap = remap
	if self.remap is not None:
	self.register_buffer("used", torch.tensor(np.load(self.remap)))
	self.re_embed = self.used.shape[0]
	self.unknown_index = unknown_index # "random" or "extra" or integer
	if self.unknown_index == "extra":
	self.unknown_index = self.re_embed
	self.re_embed = self.re_embed + 1
	print(
	f"Remapping {self.n_e} indices to {self.re_embed} indices. "
	f"Using {self.unknown_index} for unknown indices."
	)
	else:
	self.re_embed = n_e

	self.sane_index_shape = sane_index_shape

	def remap_to_used(self, inds):
	ishape = inds.shape
	assert len(ishape) > 1
	inds = inds.reshape(ishape[0], -1)
	used = self.used.to(inds)
	match = (inds[:, :, None] == used[None, None, ...]).long()
	new = match.argmax(-1)
	unknown = match.sum(2) < 1
	if self.unknown_index == "random":
	new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
	else:
	new[unknown] = self.unknown_index
	return new.reshape(ishape)

	def unmap_to_all(self, inds):
	ishape = inds.shape
	assert len(ishape) > 1
	inds = inds.reshape(ishape[0], -1)
	used = self.used.to(inds)
	if self.re_embed > self.used.shape[0]: # extra token
	inds[inds >= self.used.shape[0]] = 0 # simply set to zero
	back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
	return back.reshape(ishape)

	def forward(self, z):
	# reshape z -> (batch, height, width, channel) and flatten
	z = z.permute(0, 2, 3, 1).contiguous()
	z_flattened = z.view(-1, self.vq_embed_dim)
	# distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z

	d = (
	torch.sum(z_flattened**2, dim=1, keepdim=True)
	+ torch.sum(self.embedding.weight**2, dim=1)
	- 2 * torch.einsum("bd,dn->bn", z_flattened, self.embedding.weight.t())
	)

	min_encoding_indices = torch.argmin(d, dim=1)
	z_q = self.embedding(min_encoding_indices).view(z.shape)
	perplexity = None
	min_encodings = None

	# compute loss for embedding
	if not self.legacy:
	loss = self.beta * torch.mean((z_q.detach() - z) 2) + torch.mean((z_q - z.detach()) 2)
	else:
	loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)

	# preserve gradients
	z_q = z + (z_q - z).detach()

	# reshape back to match original input shape
	z_q = z_q.permute(0, 3, 1, 2).contiguous()

	if self.remap is not None:
	min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis
	min_encoding_indices = self.remap_to_used(min_encoding_indices)
	min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten

	if self.sane_index_shape:
	min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])

	return z_q, loss, (perplexity, min_encodings, min_encoding_indices)

	def get_codebook_entry(self, indices, shape):
	# shape specifying (batch, height, width, channel)
	if self.remap is not None:
	indices = indices.reshape(shape[0], -1) # add batch axis
	indices = self.unmap_to_all(indices)
	indices = indices.reshape(-1) # flatten again

	# get quantized latent vectors
	z_q = self.embedding(indices)

	if shape is not None:
	z_q = z_q.view(shape)
	# reshape back to match original input shape
	z_q = z_q.permute(0, 3, 1, 2).contiguous()

	return z_q


	class DiagonalGaussianDistribution(object):
	def __init__(self, parameters, deterministic=False):
	self.parameters = parameters
	self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
	self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
	self.deterministic = deterministic
	self.std = torch.exp(0.5 * self.logvar)
	self.var = torch.exp(self.logvar)
	if self.deterministic:
	self.var = self.std = torch.zeros_like(
	self.mean, device=self.parameters.device, dtype=self.parameters.dtype
	)

	def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor:
	device = self.parameters.device
	sample_device = "cpu" if device.type == "mps" else device
	sample = torch.randn(self.mean.shape, generator=generator, device=sample_device)
	# make sure sample is on the same device as the parameters and has same dtype
	sample = sample.to(device=device, dtype=self.parameters.dtype)
	x = self.mean + self.std * sample
	return x

	def kl(self, other=None):
	if self.deterministic:
	return torch.Tensor([0.0])
	else:
	if other is None:
	return 0.5 * torch.sum(torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=[1, 2, 3])
	else:
	return 0.5 * torch.sum(
	torch.pow(self.mean - other.mean, 2) / other.var
	+ self.var / other.var
	- 1.0
	- self.logvar
	+ other.logvar,
	dim=[1, 2, 3],
	)

	def nll(self, sample, dims=[1, 2, 3]):
	if self.deterministic:
	return torch.Tensor([0.0])
	logtwopi = np.log(2.0 * np.pi)
	return 0.5 * torch.sum(logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, dim=dims)

	def mode(self):
	return self.mean


	class VQModel(ModelMixin, ConfigMixin):
	r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray
	Kavukcuoglu.

	This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
	implements for all the model (such as downloading or saving, etc.)

	Parameters:
	in_channels (int, optional, defaults to 3): Number of channels in the input image.
	out_channels (int, optional, defaults to 3): Number of channels in the output.
	down_block_types (`Tuple[str]`, optional, defaults to :
	obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types.
	up_block_types (`Tuple[str]`, optional, defaults to :
	obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types.
	block_out_channels (`Tuple[int]`, optional, defaults to :
	obj:`(64,)`): Tuple of block output channels.
	act_fn (`str`, optional, defaults to `"silu"`): The activation function to use.
	latent_channels (`int`, optional, defaults to `3`): Number of channels in the latent space.
	sample_size (`int`, optional, defaults to `32`): TODO
	num_vq_embeddings (`int`, optional, defaults to `256`): Number of codebook vectors in the VQ-VAE.
	vq_embed_dim (`int`, optional): Hidden dim of codebook vectors in the VQ-VAE.
	"""

	@register_to_config
	def __init__(
	self,
	in_channels: int = 3,
	out_channels: int = 3,
	down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
	up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
	block_out_channels: Tuple[int] = (64,),
	layers_per_block: int = 1,
	act_fn: str = "silu",
	latent_channels: int = 3,
	sample_size: int = 32,
	num_vq_embeddings: int = 256,
	norm_num_groups: int = 32,
	vq_embed_dim: Optional[int] = None,
	):
	super().__init__()

	# pass init params to Encoder
	self.encoder = Encoder(
	in_channels=in_channels,
	out_channels=latent_channels,
	down_block_types=down_block_types,
	block_out_channels=block_out_channels,
	layers_per_block=layers_per_block,
	act_fn=act_fn,
	norm_num_groups=norm_num_groups,
	double_z=False,
	)

	vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels

	self.quant_conv = torch.nn.Conv2d(latent_channels, vq_embed_dim, 1)
	self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False)
	self.post_quant_conv = torch.nn.Conv2d(vq_embed_dim, latent_channels, 1)

	# pass init params to Decoder
	self.decoder = Decoder(
	in_channels=latent_channels,
	out_channels=out_channels,
	up_block_types=up_block_types,
	block_out_channels=block_out_channels,
	layers_per_block=layers_per_block,
	act_fn=act_fn,
	norm_num_groups=norm_num_groups,
	)

	def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput:
	h = self.encoder(x)
	h = self.quant_conv(h)

	if not return_dict:
	return (h,)

	return VQEncoderOutput(latents=h)

	def decode(
	self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True
	) -> Union[DecoderOutput, torch.FloatTensor]:
	# also go through quantization layer
	if not force_not_quantize:
	quant, emb_loss, info = self.quantize(h)
	else:
	quant = h
	quant = self.post_quant_conv(quant)
	dec = self.decoder(quant)

	if not return_dict:
	return (dec,)

	return DecoderOutput(sample=dec)

	def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
	r"""
	Args:
	sample (`torch.FloatTensor`): Input sample.
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
	"""
	x = sample
	h = self.encode(x).latents
	dec = self.decode(h).sample

	if not return_dict:
	return (dec,)

	return DecoderOutput(sample=dec)


	class AutoencoderKL(ModelMixin, ConfigMixin):
	r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma
	and Max Welling.

	This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
	implements for all the model (such as downloading or saving, etc.)

	Parameters:
	in_channels (int, optional, defaults to 3): Number of channels in the input image.
	out_channels (int, optional, defaults to 3): Number of channels in the output.
	down_block_types (`Tuple[str]`, optional, defaults to :
	obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types.
	up_block_types (`Tuple[str]`, optional, defaults to :
	obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types.
	block_out_channels (`Tuple[int]`, optional, defaults to :
	obj:`(64,)`): Tuple of block output channels.
	act_fn (`str`, optional, defaults to `"silu"`): The activation function to use.
	latent_channels (`int`, optional, defaults to `4`): Number of channels in the latent space.
	sample_size (`int`, optional, defaults to `32`): TODO
	"""

	@register_to_config
	def __init__(
	self,
	in_channels: int = 3,
	out_channels: int = 3,
	down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
	up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
	block_out_channels: Tuple[int] = (64,),
	layers_per_block: int = 1,
	act_fn: str = "silu",
	latent_channels: int = 4,
	norm_num_groups: int = 32,
	sample_size: int = 32,
	):
	super().__init__()

	# pass init params to Encoder
	self.encoder = Encoder(
	in_channels=in_channels,
	out_channels=latent_channels,
	down_block_types=down_block_types,
	block_out_channels=block_out_channels,
	layers_per_block=layers_per_block,
	act_fn=act_fn,
	norm_num_groups=norm_num_groups,
	double_z=True,
	)

	# pass init params to Decoder
	self.decoder = Decoder(
	in_channels=latent_channels,
	out_channels=out_channels,
	up_block_types=up_block_types,
	block_out_channels=block_out_channels,
	layers_per_block=layers_per_block,
	norm_num_groups=norm_num_groups,
	act_fn=act_fn,
	)

	self.quant_conv = torch.nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)
	self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1)
	self.use_slicing = False

	def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
	h = self.encoder(x)
	moments = self.quant_conv(h)
	posterior = DiagonalGaussianDistribution(moments)

	if not return_dict:
	return (posterior,)

	return AutoencoderKLOutput(latent_dist=posterior)

	def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
	z = self.post_quant_conv(z)
	dec = self.decoder(z)

	if not return_dict:
	return (dec,)

	return DecoderOutput(sample=dec)

	def enable_slicing(self):
	r"""
	Enable sliced VAE decoding.

	When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several
	steps. This is useful to save some memory and allow larger batch sizes.
	"""
	self.use_slicing = True

	def disable_slicing(self):
	r"""
	Disable sliced VAE decoding. If `enable_slicing` was previously invoked, this method will go back to computing
	decoding in one step.
	"""
	self.use_slicing = False

	def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]:
	if self.use_slicing and z.shape[0] > 1:
	decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
	decoded = torch.cat(decoded_slices)
	else:
	decoded = self._decode(z).sample

	if not return_dict:
	return (decoded,)

	return DecoderOutput(sample=decoded)

	def forward(
	self,
	sample: torch.FloatTensor,
	sample_posterior: bool = False,
	return_dict: bool = True,
	generator: Optional[torch.Generator] = None,
	) -> Union[DecoderOutput, torch.FloatTensor]:
	r"""
	Args:
	sample (`torch.FloatTensor`): Input sample.
	sample_posterior (`bool`, optional, defaults to `False`):
	Whether to sample from the posterior.
	return_dict (`bool`, optional, defaults to `True`):
	Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
	"""
	x = sample
	posterior = self.encode(x).latent_dist
	if sample_posterior:
	z = posterior.sample(generator=generator)
	else:
	z = posterior.mode()
	dec = self.decode(z).sample

	if not return_dict:
	return (dec,)

	return DecoderOutput(sample=dec)