Esmail-AGumaan committed on
Commit 64e1ee8
Parent(s): 33e1004

Upload 13 files

Files changed (13)
  1. attention.py +77 -0
  2. clip.py +64 -0
  3. ddpm.py +112 -0
  4. decoder.py +100 -0
  5. demo.ipynb +0 -0
  6. diffusion.py +213 -0
  7. encoder.py +56 -0
  8. gaussing_diffusion.py +814 -0
  9. model_converter.py +0 -0
  10. model_loader.py +28 -0
  11. pipeline.py +141 -0
  12. sd_gradio.py +69 -0
  13. sd_inference.py +72 -0
attention.py ADDED
@@ -0,0 +1,77 @@
import torch
from torch import nn
from torch.nn import functional as F
import math

class SelfAttention(nn.Module):
    def __init__(self, n_heads, d_embed, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        self.in_proj = nn.Linear(d_embed, 3 * d_embed, bias=in_proj_bias)
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads

    def forward(self, x, causal_mask=False):
        input_shape = x.shape
        batch_size, sequence_length, d_embed = input_shape
        interim_shape = (batch_size, sequence_length, self.n_heads, self.d_head)

        q, k, v = self.in_proj(x).chunk(3, dim=-1)
        q = q.view(interim_shape).transpose(1, 2)
        k = k.view(interim_shape).transpose(1, 2)
        v = v.view(interim_shape).transpose(1, 2)

        weight = q @ k.transpose(-1, -2)

        if causal_mask:
            mask = torch.ones_like(weight, dtype=torch.bool).triu(1)
            weight.masked_fill_(mask, -torch.inf)

        weight /= math.sqrt(self.d_head)

        weight = F.softmax(weight, dim=-1)
        output = weight @ v
        output = output.transpose(1, 2)

        output = output.reshape(input_shape)
        output = self.out_proj(output)
        return output

class CrossAttention(nn.Module):
    def __init__(self, n_heads, d_embed, d_cross, in_proj_bias=True, out_proj_bias=True):
        super().__init__()
        self.q_proj = nn.Linear(d_embed, d_embed, bias=in_proj_bias)
        self.k_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.v_proj = nn.Linear(d_cross, d_embed, bias=in_proj_bias)
        self.out_proj = nn.Linear(d_embed, d_embed, bias=out_proj_bias)
        self.n_heads = n_heads
        self.d_head = d_embed // n_heads

    def forward(self, x, y):
        input_shape = x.shape
        batch_size, sequence_length, d_embed = input_shape
        interim_shape = (batch_size, -1, self.n_heads, self.d_head)

        q = self.q_proj(x)
        k = self.k_proj(y)
        v = self.v_proj(y)

        q = q.view(interim_shape).transpose(1, 2)
        k = k.view(interim_shape).transpose(1, 2)
        v = v.view(interim_shape).transpose(1, 2)

        weight = q @ k.transpose(-1, -2)

        weight /= math.sqrt(self.d_head)

        weight = F.softmax(weight, dim=-1)

        output = weight @ v

        output = output.transpose(1, 2).contiguous()

        output = output.view(input_shape)

        output = self.out_proj(output)

        return output
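A minimal shape check for these two blocks, as a sketch only: the import path and dimensions below are illustrative assumptions, not part of the commit.

import torch
from attention import SelfAttention, CrossAttention  # hypothetical local import path

self_attn = SelfAttention(n_heads=8, d_embed=320)
x = torch.randn(1, 64, 320)                   # (batch, sequence, d_embed)
print(self_attn(x, causal_mask=True).shape)   # torch.Size([1, 64, 320])

cross_attn = CrossAttention(n_heads=8, d_embed=320, d_cross=768)
y = torch.randn(1, 77, 768)                   # e.g. a CLIP context: 77 tokens of width 768
print(cross_attn(x, y).shape)                 # torch.Size([1, 64, 320])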
clip.py ADDED
@@ -0,0 +1,64 @@
import torch
from torch import nn
from torch.nn import functional as F
from nanograd.models.stable_diffusion.attention import SelfAttention

class CLIPEmbedding(nn.Module):
    def __init__(self, n_vocab: int, n_embd: int, n_token: int):
        super().__init__()

        self.token_embedding = nn.Embedding(n_vocab, n_embd)
        self.position_embedding = nn.Parameter(torch.zeros((n_token, n_embd)))

    def forward(self, tokens):
        x = self.token_embedding(tokens)
        x += self.position_embedding

        return x

class CLIPLayer(nn.Module):
    def __init__(self, n_head: int, n_embd: int):
        super().__init__()
        self.layernorm_1 = nn.LayerNorm(n_embd)
        self.attention = SelfAttention(n_head, n_embd)
        self.layernorm_2 = nn.LayerNorm(n_embd)
        self.linear_1 = nn.Linear(n_embd, 4 * n_embd)
        self.linear_2 = nn.Linear(4 * n_embd, n_embd)

    def forward(self, x):
        residue = x
        x = self.layernorm_1(x)
        x = self.attention(x, causal_mask=True)
        x += residue

        residue = x
        x = self.layernorm_2(x)
        x = self.linear_1(x)

        x = x * torch.sigmoid(1.702 * x)  # QuickGELU activation
        x = self.linear_2(x)
        x += residue

        return x

class CLIP(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = CLIPEmbedding(49408, 768, 77)

        self.layers = nn.ModuleList([
            CLIPLayer(12, 768) for i in range(12)
        ])

        self.layernorm = nn.LayerNorm(768)

    def forward(self, tokens: torch.LongTensor) -> torch.FloatTensor:
        tokens = tokens.type(torch.long)

        state = self.embedding(tokens)

        for layer in self.layers:
            state = layer(state)
        output = self.layernorm(state)

        return output
ddpm.py ADDED
@@ -0,0 +1,112 @@
import torch
import numpy as np

class DDPMSampler: # Denoising Diffusion Probabilistic Models Sampler

    def __init__(self, generator: torch.Generator, num_training_steps=1000, beta_start: float = 0.00085, beta_end: float = 0.0120):
        # Params "beta_start" and "beta_end" taken from: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/configs/stable-diffusion/v1-inference.yaml#L5C8-L5C8
        # For the naming conventions, refer to the DDPM paper (https://arxiv.org/pdf/2006.11239.pdf)
        self.betas = torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_training_steps, dtype=torch.float32) ** 2
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.one = torch.tensor(1.0)

        self.generator = generator

        self.num_train_timesteps = num_training_steps
        self.timesteps = torch.from_numpy(np.arange(0, num_training_steps)[::-1].copy())

    def set_inference_timesteps(self, num_inference_steps=50):
        self.num_inference_steps = num_inference_steps
        step_ratio = self.num_train_timesteps // self.num_inference_steps
        timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
        self.timesteps = torch.from_numpy(timesteps)

    def _get_previous_timestep(self, timestep: int) -> int:
        prev_t = timestep - self.num_train_timesteps // self.num_inference_steps
        return prev_t

    def _get_variance(self, timestep: int) -> torch.Tensor:
        prev_t = self._get_previous_timestep(timestep)

        alpha_prod_t = self.alphas_cumprod[timestep]
        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
        current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev

        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t

        variance = torch.clamp(variance, min=1e-20)

        return variance

    def set_strength(self, strength=1):
        """
        Set how much noise to add to the input image.
        More noise (strength ~ 1) means that the output will be further from the input image.
        Less noise (strength ~ 0) means that the output will be closer to the input image.
        """
        # start_step is the number of noise levels to skip
        start_step = self.num_inference_steps - int(self.num_inference_steps * strength)
        self.timesteps = self.timesteps[start_step:]
        self.start_step = start_step

    def step(self, timestep: int, latents: torch.Tensor, model_output: torch.Tensor):
        t = timestep
        prev_t = self._get_previous_timestep(t)

        alpha_prod_t = self.alphas_cumprod[t]
        alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
        beta_prod_t = 1 - alpha_prod_t
        beta_prod_t_prev = 1 - alpha_prod_t_prev
        current_alpha_t = alpha_prod_t / alpha_prod_t_prev
        current_beta_t = 1 - current_alpha_t

        # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
        pred_original_sample = (latents - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)

        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
        current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t

        # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
        pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * latents

        variance = 0
        if t > 0:
            device = model_output.device
            noise = torch.randn(model_output.shape, generator=self.generator, device=device, dtype=model_output.dtype)
            # Compute the variance as per formula (7) from https://arxiv.org/pdf/2006.11239.pdf
            variance = (self._get_variance(t) ** 0.5) * noise

        pred_prev_sample = pred_prev_sample + variance

        return pred_prev_sample

    def add_noise(
        self,
        original_samples: torch.FloatTensor,
        timesteps: torch.IntTensor,
    ) -> torch.FloatTensor:
        alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
        timesteps = timesteps.to(original_samples.device)

        sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
        sqrt_alpha_prod = sqrt_alpha_prod.flatten()
        while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
            sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)

        sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
        sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
        while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
            sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)

        # Sample from q(x_t | x_0) as in equation (4) of https://arxiv.org/pdf/2006.11239.pdf
        # Because N(mu, sigma) = X can be obtained by X = mu + sigma * N(0, 1)
        # here mu = sqrt_alpha_prod * original_samples and sigma = sqrt_one_minus_alpha_prod
        noise = torch.randn(original_samples.shape, generator=self.generator, device=original_samples.device, dtype=original_samples.dtype)
        noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
        return noisy_samples
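A minimal sketch of how this sampler is driven during inference; the noise prediction here is a stand-in for the diffusion UNet's output, and the import path is an assumption.

import torch
from ddpm import DDPMSampler  # hypothetical local import path

generator = torch.Generator().manual_seed(42)
sampler = DDPMSampler(generator)
sampler.set_inference_timesteps(50)

latents = torch.randn(1, 4, 64, 64)           # random starting latent
for t in sampler.timesteps:
    noise_pred = torch.zeros_like(latents)    # stand-in for the model's predicted noise
    latents = sampler.step(int(t), latents, noise_pred)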
decoder.py ADDED
@@ -0,0 +1,100 @@
import torch
from torch import nn
from torch.nn import functional as F
from nanograd.models.stable_diffusion.attention import SelfAttention

class VAE_AttentionBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.groupnorm = nn.GroupNorm(32, channels)
        self.attention = SelfAttention(1, channels)

    def forward(self, x):
        residue = x
        x = self.groupnorm(x)
        n, c, h, w = x.shape
        x = x.view((n, c, h * w))
        x = x.transpose(-1, -2)
        x = self.attention(x)
        x = x.transpose(-1, -2)
        x = x.view((n, c, h, w))
        x += residue

        return x

class VAE_ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.groupnorm_1 = nn.GroupNorm(32, in_channels)
        self.conv_1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

        self.groupnorm_2 = nn.GroupNorm(32, out_channels)
        self.conv_2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

        if in_channels == out_channels:
            self.residual_layer = nn.Identity()
        else:
            self.residual_layer = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)

    def forward(self, x):
        residue = x
        x = self.groupnorm_1(x)
        x = F.silu(x)
        x = self.conv_1(x)
        x = self.groupnorm_2(x)
        x = F.silu(x)
        x = self.conv_2(x)

        return x + self.residual_layer(residue)

class VAE_Decoder(nn.Sequential):
    def __init__(self):
        super().__init__(
            nn.Conv2d(4, 4, kernel_size=1, padding=0),
            nn.Conv2d(4, 512, kernel_size=3, padding=1),
            VAE_ResidualBlock(512, 512),
            VAE_AttentionBlock(512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),

            # (Batch_Size, 512, Height / 8, Width / 8) -> (Batch_Size, 512, Height / 4, Width / 4)
            nn.Upsample(scale_factor=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),

            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),

            # (Batch_Size, 512, Height / 4, Width / 4) -> (Batch_Size, 512, Height / 2, Width / 2)
            nn.Upsample(scale_factor=2),

            nn.Conv2d(512, 512, kernel_size=3, padding=1),

            VAE_ResidualBlock(512, 256),
            VAE_ResidualBlock(256, 256),
            VAE_ResidualBlock(256, 256),

            nn.Upsample(scale_factor=2),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),

            VAE_ResidualBlock(256, 128),
            VAE_ResidualBlock(128, 128),
            VAE_ResidualBlock(128, 128),

            nn.GroupNorm(32, 128),

            nn.SiLU(),

            nn.Conv2d(128, 3, kernel_size=3, padding=1),
        )

    def forward(self, x):
        x /= 0.18215

        for module in self:
            x = module(x)
        return x
demo.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
diffusion.py ADDED
@@ -0,0 +1,213 @@
import torch
from torch import nn
from torch.nn import functional as F
from nanograd.models.stable_diffusion.attention import SelfAttention, CrossAttention

class TimeEmbedding(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.linear_1 = nn.Linear(n_embd, 4 * n_embd)
        self.linear_2 = nn.Linear(4 * n_embd, 4 * n_embd)

    def forward(self, x):
        x = self.linear_1(x)
        x = F.silu(x)
        x = self.linear_2(x)

        return x

class UNET_ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, n_time=1280):
        super().__init__()
        self.groupnorm_feature = nn.GroupNorm(32, in_channels)
        self.conv_feature = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.linear_time = nn.Linear(n_time, out_channels)

        self.groupnorm_merged = nn.GroupNorm(32, out_channels)
        self.conv_merged = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)

        if in_channels == out_channels:
            self.residual_layer = nn.Identity()
        else:
            self.residual_layer = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)

    def forward(self, feature, time):
        residue = feature

        feature = self.groupnorm_feature(feature)
        feature = F.silu(feature)
        feature = self.conv_feature(feature)

        time = F.silu(time)

        time = self.linear_time(time)
        merged = feature + time.unsqueeze(-1).unsqueeze(-1)
        merged = self.groupnorm_merged(merged)
        merged = F.silu(merged)
        merged = self.conv_merged(merged)

        return merged + self.residual_layer(residue)

class UNET_AttentionBlock(nn.Module):
    def __init__(self, n_head: int, n_embd: int, d_context=768):
        super().__init__()
        channels = n_head * n_embd

        self.groupnorm = nn.GroupNorm(32, channels, eps=1e-6)
        self.conv_input = nn.Conv2d(channels, channels, kernel_size=1, padding=0)

        self.layernorm_1 = nn.LayerNorm(channels)
        self.attention_1 = SelfAttention(n_head, channels, in_proj_bias=False)
        self.layernorm_2 = nn.LayerNorm(channels)
        self.attention_2 = CrossAttention(n_head, channels, d_context, in_proj_bias=False)
        self.layernorm_3 = nn.LayerNorm(channels)
        self.linear_geglu_1 = nn.Linear(channels, 4 * channels * 2)
        self.linear_geglu_2 = nn.Linear(4 * channels, channels)

        self.conv_output = nn.Conv2d(channels, channels, kernel_size=1, padding=0)

    def forward(self, x, context):
        residue_long = x

        x = self.groupnorm(x)
        x = self.conv_input(x)

        n, c, h, w = x.shape
        x = x.view((n, c, h * w))

        x = x.transpose(-1, -2)

        residue_short = x

        x = self.layernorm_1(x)
        x = self.attention_1(x)
        x += residue_short

        residue_short = x

        x = self.layernorm_2(x)
        x = self.attention_2(x, context)

        x += residue_short

        residue_short = x

        x = self.layernorm_3(x)

        # GeGLU as implemented in the original code: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/attention.py#L37C10-L37C10
        x, gate = self.linear_geglu_1(x).chunk(2, dim=-1)
        x = x * F.gelu(gate)
        x = self.linear_geglu_2(x)
        x += residue_short
        x = x.transpose(-1, -2)

        x = x.view((n, c, h, w))

        return self.conv_output(x) + residue_long

class Upsample(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = F.interpolate(x, scale_factor=2, mode='nearest')
        return self.conv(x)

class SwitchSequential(nn.Sequential):
    def forward(self, x, context, time):
        for layer in self:
            if isinstance(layer, UNET_AttentionBlock):
                x = layer(x, context)
            elif isinstance(layer, UNET_ResidualBlock):
                x = layer(x, time)
            else:
                x = layer(x)
        return x

class UNET(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoders = nn.ModuleList([
            SwitchSequential(nn.Conv2d(4, 320, kernel_size=3, padding=1)),
            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),
            SwitchSequential(UNET_ResidualBlock(320, 320), UNET_AttentionBlock(8, 40)),
            SwitchSequential(nn.Conv2d(320, 320, kernel_size=3, stride=2, padding=1)),
            SwitchSequential(UNET_ResidualBlock(320, 640), UNET_AttentionBlock(8, 80)),
            SwitchSequential(UNET_ResidualBlock(640, 640), UNET_AttentionBlock(8, 80)),
            SwitchSequential(nn.Conv2d(640, 640, kernel_size=3, stride=2, padding=1)),
            SwitchSequential(UNET_ResidualBlock(640, 1280), UNET_AttentionBlock(8, 160)),
            SwitchSequential(UNET_ResidualBlock(1280, 1280), UNET_AttentionBlock(8, 160)),
            SwitchSequential(nn.Conv2d(1280, 1280, kernel_size=3, stride=2, padding=1)),
            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
            SwitchSequential(UNET_ResidualBlock(1280, 1280)),
        ])

        self.bottleneck = SwitchSequential(
            UNET_ResidualBlock(1280, 1280),
            UNET_AttentionBlock(8, 160),
            UNET_ResidualBlock(1280, 1280),
        )

        self.decoders = nn.ModuleList([
            SwitchSequential(UNET_ResidualBlock(2560, 1280)),
            SwitchSequential(UNET_ResidualBlock(2560, 1280)),
            SwitchSequential(UNET_ResidualBlock(2560, 1280), Upsample(1280)),
            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),

            SwitchSequential(UNET_ResidualBlock(2560, 1280), UNET_AttentionBlock(8, 160)),
            SwitchSequential(UNET_ResidualBlock(1920, 1280), UNET_AttentionBlock(8, 160), Upsample(1280)),

            SwitchSequential(UNET_ResidualBlock(1920, 640), UNET_AttentionBlock(8, 80)),
            SwitchSequential(UNET_ResidualBlock(1280, 640), UNET_AttentionBlock(8, 80)),

            SwitchSequential(UNET_ResidualBlock(960, 640), UNET_AttentionBlock(8, 80), Upsample(640)),
            SwitchSequential(UNET_ResidualBlock(960, 320), UNET_AttentionBlock(8, 40)),
            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),

            SwitchSequential(UNET_ResidualBlock(640, 320), UNET_AttentionBlock(8, 40)),
        ])

    def forward(self, x, context, time):
        skip_connections = []
        for layers in self.encoders:
            x = layers(x, context, time)
            skip_connections.append(x)

        x = self.bottleneck(x, context, time)

        for layers in self.decoders:
            x = torch.cat((x, skip_connections.pop()), dim=1)
            x = layers(x, context, time)

        return x


class UNET_OutputLayer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.groupnorm = nn.GroupNorm(32, in_channels)
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)

    def forward(self, x):
        x = self.groupnorm(x)
        x = F.silu(x)
        x = self.conv(x)

        return x

class Diffusion(nn.Module):
    def __init__(self):
        super().__init__()
        self.time_embedding = TimeEmbedding(320)
        self.unet = UNET()
        self.final = UNET_OutputLayer(320, 4)

    def forward(self, latent, context, time):
        time = self.time_embedding(time)

        output = self.unet(latent, context, time)

        output = self.final(output)

        return output
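A shape sketch for the full denoiser, under the dimensions used by the modules above (512x512 images, CLIP context of 77 tokens, 320-dim timestep embedding); the import path is an assumption.

import torch
from diffusion import Diffusion   # hypothetical local import path

model = Diffusion()
latent = torch.randn(1, 4, 64, 64)          # VAE latent for a 512x512 image
context = torch.randn(1, 77, 768)           # CLIP text embeddings
time = torch.randn(1, 320)                  # sinusoidal timestep embedding
print(model(latent, context, time).shape)   # torch.Size([1, 4, 64, 64])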
encoder.py ADDED
@@ -0,0 +1,56 @@
import torch
from torch import nn
from torch.nn import functional as F
from nanograd.models.stable_diffusion.decoder import VAE_AttentionBlock, VAE_ResidualBlock

class VAE_Encoder(nn.Sequential):
    def __init__(self):
        super().__init__(
            nn.Conv2d(3, 128, kernel_size=3, padding=1),

            VAE_ResidualBlock(128, 128),
            VAE_ResidualBlock(128, 128),

            nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=0),

            VAE_ResidualBlock(128, 256),
            VAE_ResidualBlock(256, 256),

            nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=0),

            VAE_ResidualBlock(256, 512),
            VAE_ResidualBlock(512, 512),

            nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=0),

            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_ResidualBlock(512, 512),
            VAE_AttentionBlock(512),
            VAE_ResidualBlock(512, 512),

            nn.GroupNorm(32, 512),

            nn.SiLU(),

            nn.Conv2d(512, 8, kernel_size=3, padding=1),

            nn.Conv2d(8, 8, kernel_size=1, padding=0),
        )

    def forward(self, x, noise):
        for module in self:

            if getattr(module, 'stride', None) == (2, 2):
                x = F.pad(x, (0, 1, 0, 1))

            x = module(x)
        mean, log_variance = torch.chunk(x, 2, dim=1)
        log_variance = torch.clamp(log_variance, -30, 20)
        variance = log_variance.exp()
        stdev = variance.sqrt()
        x = mean + stdev * noise
        # Constant taken from: https://github.com/CompVis/stable-diffusion/blob/21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/configs/stable-diffusion/v1-inference.yaml#L17C1-L17C1
        x *= 0.18215

        return x
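A sketch of the encode/decode round trip these two halves of the VAE implement; the tensors below are illustrative stand-ins, and the 0.18215 scaling is handled inside the modules themselves.

import torch
from encoder import VAE_Encoder   # hypothetical local import paths
from decoder import VAE_Decoder

encoder, decoder = VAE_Encoder(), VAE_Decoder()
image = torch.randn(1, 3, 512, 512)    # stand-in for a normalized RGB image
noise = torch.randn(1, 4, 64, 64)      # noise for the reparameterization trick
latents = encoder(image, noise)        # (1, 4, 64, 64), already scaled by 0.18215
reconstruction = decoder(latents)      # (1, 3, 512, 512), scaling undone in the decoder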
gaussing_diffusion.py ADDED
@@ -0,0 +1,814 @@
1
+ import torch
2
+ import math
3
+ import numpy as np
4
+ import enum
5
+
6
+ class GaussingDistribution:
7
+ def __init__(self, parameters: torch.Tensor) -> None:
8
+ self.mean, log_variance = torch.chunk(parameters, 2, dim=1)
9
+ self.log_variance = torch.clamp(log_variance, -30.0, 20.0)
10
+ self.std = torch.exp(0.5 * self.log_variance)
11
+
12
+ def sample(self):
13
+ return self.mean + self.std * torch.randn_like(self.std)
14
+
15
+ def normal_kl(mean1, logvar1, mean2, logvar2):
16
+ tensor = None
17
+ for obj in (mean1, logvar1, mean2, logvar2):
18
+ if isinstance(obj, torch.Tensor):
19
+ tensor = obj
20
+ break
21
+ assert tensor is not None, "at least one argument must be a Tensor"
22
+
23
+ logvar1, logvar2 = [
24
+ x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
25
+ for x in (logvar1, logvar2)
26
+ ]
27
+
28
+ return 0.5 * (
29
+ -1.0
30
+ + logvar2
31
+ - logvar1
32
+ + torch.exp(logvar1 - logvar2)
33
+ + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
34
+ )
35
+
36
+
37
+ def approx_standard_normal_cdf(x):
38
+
39
+ return 0.5 * (1.0 + torch.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))
40
+
41
+
42
+ def continuous_gaussian_log_likelihood(x, *, means, log_scales):
43
+
44
+ centered_x = x - means
45
+ inv_stdv = torch.exp(-log_scales)
46
+ normalized_x = centered_x * inv_stdv
47
+ log_probs = torch.distributions.Normal(torch.zeros_like(x), torch.ones_like(x)).log_prob(normalized_x)
48
+ return log_probs
49
+
50
+
51
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
52
+
53
+ assert x.shape == means.shape == log_scales.shape
54
+ centered_x = x - means
55
+ inv_stdv = torch.exp(-log_scales)
56
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
57
+ cdf_plus = approx_standard_normal_cdf(plus_in)
58
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
59
+ cdf_min = approx_standard_normal_cdf(min_in)
60
+ log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
61
+ log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
62
+ cdf_delta = cdf_plus - cdf_min
63
+ log_probs = torch.where(
64
+ x < -0.999,
65
+ log_cdf_plus,
66
+ torch.where(x > 0.999, log_one_minus_cdf_min, torch.log(cdf_delta.clamp(min=1e-12))),
67
+ )
68
+ assert log_probs.shape == x.shape
69
+ return log_probs
70
+
71
+ ################# Gaussing ####################
72
+
73
+ def mean_flat(tensor):
74
+
75
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
76
+
77
+
78
+ class ModelMeanType(enum.Enum):
79
+
80
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
81
+ START_X = enum.auto() # the model predicts x_0
82
+ EPSILON = enum.auto() # the model predicts epsilon
83
+
84
+
85
+ class ModelVarType(enum.Enum):
86
+
87
+ LEARNED = enum.auto()
88
+ FIXED_SMALL = enum.auto()
89
+ FIXED_LARGE = enum.auto()
90
+ LEARNED_RANGE = enum.auto()
91
+
92
+
93
+ class LossType(enum.Enum):
94
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
95
+ RESCALED_MSE = (
96
+ enum.auto()
97
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
98
+ KL = enum.auto() # use the variational lower-bound
99
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
100
+
101
+ def is_vb(self):
102
+ return self == LossType.KL or self == LossType.RESCALED_KL
103
+
104
+
105
+ def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
106
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
107
+ warmup_time = int(num_diffusion_timesteps * warmup_frac)
108
+ betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
109
+ return betas
110
+
111
+
112
+ def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
113
+ if beta_schedule == "quad":
114
+ betas = (
115
+ np.linspace(
116
+ beta_start ** 0.5,
117
+ beta_end ** 0.5,
118
+ num_diffusion_timesteps,
119
+ dtype=np.float64,
120
+ )
121
+ ** 2
122
+ )
123
+ elif beta_schedule == "linear":
124
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
125
+ elif beta_schedule == "warmup10":
126
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
127
+ elif beta_schedule == "warmup50":
128
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
129
+ elif beta_schedule == "const":
130
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
131
+ elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
132
+ betas = 1.0 / np.linspace(
133
+ num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
134
+ )
135
+ else:
136
+ raise NotImplementedError(beta_schedule)
137
+ assert betas.shape == (num_diffusion_timesteps,)
138
+ return betas
139
+
140
+
141
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
142
+
143
+ if schedule_name == "linear":
144
+
145
+ scale = 1000 / num_diffusion_timesteps
146
+ return get_beta_schedule(
147
+ "linear",
148
+ beta_start=scale * 0.0001,
149
+ beta_end=scale * 0.02,
150
+ num_diffusion_timesteps=num_diffusion_timesteps,
151
+ )
152
+ elif schedule_name == "squaredcos_cap_v2":
153
+ return betas_for_alpha_bar(
154
+ num_diffusion_timesteps,
155
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
156
+ )
157
+ else:
158
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
159
+
160
+
161
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
162
+
163
+ betas = []
164
+ for i in range(num_diffusion_timesteps):
165
+ t1 = i / num_diffusion_timesteps
166
+ t2 = (i + 1) / num_diffusion_timesteps
167
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
168
+ return np.array(betas)
169
+
170
+
171
+ class GaussianDiffusion:
172
+ def __init__(
173
+ self,
174
+ *,
175
+ betas,
176
+ model_mean_type,
177
+ model_var_type,
178
+ loss_type
179
+ ):
180
+
181
+ self.model_mean_type = model_mean_type
182
+ self.model_var_type = model_var_type
183
+ self.loss_type = loss_type
184
+
185
+ # Use float64 for accuracy.
186
+ betas = np.array(betas, dtype=np.float64)
187
+ self.betas = betas
188
+ assert len(betas.shape) == 1, "betas must be 1-D"
189
+ assert (betas > 0).all() and (betas <= 1).all()
190
+
191
+ self.num_timesteps = int(betas.shape[0])
192
+
193
+ alphas = 1.0 - betas
194
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
195
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
196
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
197
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
198
+
199
+ # calculations for diffusion q(x_t | x_{t-1}) and others
200
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
201
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
202
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
203
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
204
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
205
+
206
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
207
+ self.posterior_variance = (
208
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
209
+ )
210
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
211
+ self.posterior_log_variance_clipped = np.log(
212
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
213
+ ) if len(self.posterior_variance) > 1 else np.array([])
214
+
215
+ self.posterior_mean_coef1 = (
216
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
217
+ )
218
+ self.posterior_mean_coef2 = (
219
+ (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
220
+ )
221
+
222
+ def q_mean_variance(self, x_start, t):
223
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
224
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
225
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
226
+ return mean, variance, log_variance
227
+
228
+ def q_sample(self, x_start, t, noise=None):
229
+ if noise is None:
230
+ noise = torch.randn_like(x_start)
231
+ assert noise.shape == x_start.shape
232
+ return (
233
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
234
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
235
+ )
236
+
237
+ def q_posterior_mean_variance(self, x_start, x_t, t):
238
+ assert x_start.shape == x_t.shape
239
+ posterior_mean = (
240
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
241
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
242
+ )
243
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
244
+ posterior_log_variance_clipped = _extract_into_tensor(
245
+ self.posterior_log_variance_clipped, t, x_t.shape
246
+ )
247
+ assert (
248
+ posterior_mean.shape[0]
249
+ == posterior_variance.shape[0]
250
+ == posterior_log_variance_clipped.shape[0]
251
+ == x_start.shape[0]
252
+ )
253
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
254
+
255
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
256
+ if model_kwargs is None:
257
+ model_kwargs = {}
258
+
259
+ B, C = x.shape[:2]
260
+ assert t.shape == (B,)
261
+ model_output = model(x, t, **model_kwargs)
262
+ if isinstance(model_output, tuple):
263
+ model_output, extra = model_output
264
+ else:
265
+ extra = None
266
+
267
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
268
+ assert model_output.shape == (B, C * 2, *x.shape[2:])
269
+ model_output, model_var_values = torch.split(model_output, C, dim=1)
270
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
271
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
272
+ frac = (model_var_values + 1) / 2
273
+ model_log_variance = frac * max_log + (1 - frac) * min_log
274
+ model_variance = torch.exp(model_log_variance)
275
+ else:
276
+ model_variance, model_log_variance = {
277
+ ModelVarType.FIXED_LARGE: (
278
+ np.append(self.posterior_variance[1], self.betas[1:]),
279
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
280
+ ),
281
+ ModelVarType.FIXED_SMALL: (
282
+ self.posterior_variance,
283
+ self.posterior_log_variance_clipped,
284
+ ),
285
+ }[self.model_var_type]
286
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
287
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
288
+
289
+ def process_xstart(x):
290
+ if denoised_fn is not None:
291
+ x = denoised_fn(x)
292
+ if clip_denoised:
293
+ return x.clamp(-1, 1)
294
+ return x
295
+
296
+ if self.model_mean_type == ModelMeanType.START_X:
297
+ pred_xstart = process_xstart(model_output)
298
+ else:
299
+ pred_xstart = process_xstart(
300
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
301
+ )
302
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
303
+
304
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
305
+ return {
306
+ "mean": model_mean,
307
+ "variance": model_variance,
308
+ "log_variance": model_log_variance,
309
+ "pred_xstart": pred_xstart,
310
+ "extra": extra,
311
+ }
312
+
313
+ def _predict_xstart_from_eps(self, x_t, t, eps):
314
+ assert x_t.shape == eps.shape
315
+ return (
316
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
317
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
318
+ )
319
+
320
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
321
+ return (
322
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
323
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
324
+
325
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
326
+ gradient = cond_fn(x, t, **model_kwargs)
327
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
328
+ return new_mean
329
+
330
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
331
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
332
+
333
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
334
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
335
+
336
+ out = p_mean_var.copy()
337
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
338
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
339
+ return out
340
+
341
+ def p_sample(
342
+ self,
343
+ model,
344
+ x,
345
+ t,
346
+ clip_denoised=True,
347
+ denoised_fn=None,
348
+ cond_fn=None,
349
+ model_kwargs=None,
350
+ ):
351
+ out = self.p_mean_variance(
352
+ model,
353
+ x,
354
+ t,
355
+ clip_denoised=clip_denoised,
356
+ denoised_fn=denoised_fn,
357
+ model_kwargs=model_kwargs,
358
+ )
359
+ noise = torch.randn_like(x)
360
+ nonzero_mask = (
361
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
362
+ ) # no noise when t == 0
363
+ if cond_fn is not None:
364
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
365
+ sample = out["mean"] + nonzero_mask * torch.exp(0.5 * out["log_variance"]) * noise
366
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
367
+
368
+ def p_sample_loop(
369
+ self,
370
+ model,
371
+ shape,
372
+ noise=None,
373
+ clip_denoised=True,
374
+ denoised_fn=None,
375
+ cond_fn=None,
376
+ model_kwargs=None,
377
+ device=None,
378
+ progress=False,
379
+ ):
380
+ final = None
381
+ for sample in self.p_sample_loop_progressive(
382
+ model,
383
+ shape,
384
+ noise=noise,
385
+ clip_denoised=clip_denoised,
386
+ denoised_fn=denoised_fn,
387
+ cond_fn=cond_fn,
388
+ model_kwargs=model_kwargs,
389
+ device=device,
390
+ progress=progress,
391
+ ):
392
+ final = sample
393
+ return final["sample"]
394
+
395
+ def p_sample_loop_progressive(
396
+ self,
397
+ model,
398
+ shape,
399
+ noise=None,
400
+ clip_denoised=True,
401
+ denoised_fn=None,
402
+ cond_fn=None,
403
+ model_kwargs=None,
404
+ device=None,
405
+ progress=False,
406
+ ):
407
+ if device is None:
408
+ device = next(model.parameters()).device
409
+ assert isinstance(shape, (tuple, list))
410
+ if noise is not None:
411
+ img = noise
412
+ else:
413
+ img = torch.randn(*shape, device=device)
414
+ indices = list(range(self.num_timesteps))[::-1]
415
+
416
+ if progress:
417
+ # Lazy import so that we don't depend on tqdm.
418
+ from tqdm.auto import tqdm
419
+
420
+ indices = tqdm(indices)
421
+
422
+ for i in indices:
423
+ t = torch.tensor([i] * shape[0], device=device)
424
+ with torch.no_grad():
425
+ out = self.p_sample(
426
+ model,
427
+ img,
428
+ t,
429
+ clip_denoised=clip_denoised,
430
+ denoised_fn=denoised_fn,
431
+ cond_fn=cond_fn,
432
+ model_kwargs=model_kwargs,
433
+ )
434
+ yield out
435
+ img = out["sample"]
436
+
437
+ def ddim_sample(
438
+ self,
439
+ model,
440
+ x,
441
+ t,
442
+ clip_denoised=True,
443
+ denoised_fn=None,
444
+ cond_fn=None,
445
+ model_kwargs=None,
446
+ eta=0.0,
447
+ ):
448
+ out = self.p_mean_variance(
449
+ model,
450
+ x,
451
+ t,
452
+ clip_denoised=clip_denoised,
453
+ denoised_fn=denoised_fn,
454
+ model_kwargs=model_kwargs,
455
+ )
456
+ if cond_fn is not None:
457
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
458
+
459
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
460
+
461
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
462
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
463
+ sigma = (
464
+ eta
465
+ * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
466
+ * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
467
+ )
468
+ # Equation 12.
469
+ noise = torch.randn_like(x)
470
+ mean_pred = (
471
+ out["pred_xstart"] * torch.sqrt(alpha_bar_prev)
472
+ + torch.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
473
+ )
474
+ nonzero_mask = (
475
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
476
+ ) # no noise when t == 0
477
+ sample = mean_pred + nonzero_mask * sigma * noise
478
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
479
+
480
+ def ddim_reverse_sample(
481
+ self,
482
+ model,
483
+ x,
484
+ t,
485
+ clip_denoised=True,
486
+ denoised_fn=None,
487
+ cond_fn=None,
488
+ model_kwargs=None,
489
+ eta=0.0,
490
+ ):
491
+
492
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
493
+ out = self.p_mean_variance(
494
+ model,
495
+ x,
496
+ t,
497
+ clip_denoised=clip_denoised,
498
+ denoised_fn=denoised_fn,
499
+ model_kwargs=model_kwargs,
500
+ )
501
+ if cond_fn is not None:
502
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
503
+ eps = (
504
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
505
+ - out["pred_xstart"]
506
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
507
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
508
+
509
+ # Equation 12. reversed
510
+ mean_pred = out["pred_xstart"] * torch.sqrt(alpha_bar_next) + torch.sqrt(1 - alpha_bar_next) * eps
511
+
512
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
513
+
514
+ def ddim_sample_loop(
515
+ self,
516
+ model,
517
+ shape,
518
+ noise=None,
519
+ clip_denoised=True,
520
+ denoised_fn=None,
521
+ cond_fn=None,
522
+ model_kwargs=None,
523
+ device=None,
524
+ progress=False,
525
+ eta=0.0,
526
+ ):
527
+ final = None
528
+ for sample in self.ddim_sample_loop_progressive(
529
+ model,
530
+ shape,
531
+ noise=noise,
532
+ clip_denoised=clip_denoised,
533
+ denoised_fn=denoised_fn,
534
+ cond_fn=cond_fn,
535
+ model_kwargs=model_kwargs,
536
+ device=device,
537
+ progress=progress,
538
+ eta=eta,
539
+ ):
540
+ final = sample
541
+ return final["sample"]
542
+
543
+ def ddim_sample_loop_progressive(
544
+ self,
545
+ model,
546
+ shape,
547
+ noise=None,
548
+ clip_denoised=True,
549
+ denoised_fn=None,
550
+ cond_fn=None,
551
+ model_kwargs=None,
552
+ device=None,
553
+ progress=False,
554
+ eta=0.0,
555
+ ):
556
+ if device is None:
557
+ device = next(model.parameters()).device
558
+ assert isinstance(shape, (tuple, list))
559
+ if noise is not None:
560
+ img = noise
561
+ else:
562
+ img = torch.randn(*shape, device=device)
563
+ indices = list(range(self.num_timesteps))[::-1]
564
+
565
+ if progress:
566
+ # Lazy import so that we don't depend on tqdm.
567
+ from tqdm.auto import tqdm
568
+
569
+ indices = tqdm(indices)
570
+
571
+ for i in indices:
572
+ t = torch.tensor([i] * shape[0], device=device)
573
+ with torch.no_grad():
574
+ out = self.ddim_sample(
575
+ model,
576
+ img,
577
+ t,
578
+ clip_denoised=clip_denoised,
579
+ denoised_fn=denoised_fn,
580
+ cond_fn=cond_fn,
581
+ model_kwargs=model_kwargs,
582
+ eta=eta,
583
+ )
584
+ yield out
585
+ img = out["sample"]
586
+
587
+ def _vb_terms_bpd(
588
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
589
+ ):
590
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
591
+ x_start=x_start, x_t=x_t, t=t
592
+ )
593
+ out = self.p_mean_variance(
594
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
595
+ )
596
+ kl = normal_kl(
597
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
598
+ )
599
+ kl = mean_flat(kl) / np.log(2.0)
600
+
601
+ decoder_nll = -discretized_gaussian_log_likelihood(
602
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
603
+ )
604
+ assert decoder_nll.shape == x_start.shape
605
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
606
+
607
+ # At the first timestep return the decoder NLL,
608
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
609
+ output = torch.where((t == 0), decoder_nll, kl)
610
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
611
+
612
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
613
+
614
+ if model_kwargs is None:
615
+ model_kwargs = {}
616
+ if noise is None:
617
+ noise = torch.randn_like(x_start)
618
+ x_t = self.q_sample(x_start, t, noise=noise)
619
+
620
+ terms = {}
621
+
622
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
623
+ terms["loss"] = self._vb_terms_bpd(
624
+ model=model,
625
+ x_start=x_start,
626
+ x_t=x_t,
627
+ t=t,
628
+ clip_denoised=False,
629
+ model_kwargs=model_kwargs,
630
+ )["output"]
631
+ if self.loss_type == LossType.RESCALED_KL:
632
+ terms["loss"] *= self.num_timesteps
633
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
634
+ model_output = model(x_t, t, **model_kwargs)
635
+
636
+ if self.model_var_type in [
637
+ ModelVarType.LEARNED,
638
+ ModelVarType.LEARNED_RANGE,
639
+ ]:
640
+ B, C = x_t.shape[:2]
641
+ assert model_output.shape == (B, C * 2, *x_t.shape[2:])
642
+ model_output, model_var_values = torch.split(model_output, C, dim=1)
643
+
644
+ frozen_out = torch.cat([model_output.detach(), model_var_values], dim=1)
645
+ terms["vb"] = self._vb_terms_bpd(
646
+ model=lambda *args, r=frozen_out: r,
647
+ x_start=x_start,
648
+ x_t=x_t,
649
+ t=t,
650
+ clip_denoised=False,
651
+ )["output"]
652
+ if self.loss_type == LossType.RESCALED_MSE:
653
+
654
+ terms["vb"] *= self.num_timesteps / 1000.0
655
+
656
+ target = {
657
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
658
+ x_start=x_start, x_t=x_t, t=t
659
+ )[0],
660
+ ModelMeanType.START_X: x_start,
661
+ ModelMeanType.EPSILON: noise,
662
+ }[self.model_mean_type]
663
+ assert model_output.shape == target.shape == x_start.shape
664
+ terms["mse"] = mean_flat((target - model_output) ** 2)
665
+ if "vb" in terms:
666
+ terms["loss"] = terms["mse"] + terms["vb"]
667
+ else:
668
+ terms["loss"] = terms["mse"]
669
+ else:
670
+ raise NotImplementedError(self.loss_type)
671
+
672
+ return terms
673
+
674
+ def _prior_bpd(self, x_start):
675
+ batch_size = x_start.shape[0]
676
+ t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
677
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
678
+ kl_prior = normal_kl(
679
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
680
+ )
681
+ return mean_flat(kl_prior) / np.log(2.0)
682
+
683
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
684
+
685
+ device = x_start.device
686
+ batch_size = x_start.shape[0]
687
+
688
+ vb = []
689
+ xstart_mse = []
690
+ mse = []
691
+ for t in list(range(self.num_timesteps))[::-1]:
692
+ t_batch = torch.tensor([t] * batch_size, device=device)
693
+ noise = torch.randn_like(x_start)
694
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
695
+ # Calculate VLB term at the current timestep
696
+ with torch.no_grad():
697
+ out = self._vb_terms_bpd(
698
+ model,
699
+ x_start=x_start,
700
+ x_t=x_t,
701
+ t=t_batch,
702
+ clip_denoised=clip_denoised,
703
+ model_kwargs=model_kwargs,
704
+ )
705
+ vb.append(out["output"])
706
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
707
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
708
+ mse.append(mean_flat((eps - noise) ** 2))
709
+
710
+ vb = torch.stack(vb, dim=1)
711
+ xstart_mse = torch.stack(xstart_mse, dim=1)
712
+ mse = torch.stack(mse, dim=1)
713
+
714
+ prior_bpd = self._prior_bpd(x_start)
715
+ total_bpd = vb.sum(dim=1) + prior_bpd
716
+ return {
717
+ "total_bpd": total_bpd,
718
+ "prior_bpd": prior_bpd,
719
+ "vb": vb,
720
+ "xstart_mse": xstart_mse,
721
+ "mse": mse,
722
+ }
723
+
724
+
725
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
726
+ res = torch.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
727
+ while len(res.shape) < len(broadcast_shape):
728
+ res = res[..., None]
729
+ return res + torch.zeros(broadcast_shape, device=timesteps.device)
730
+
731
+ ############################### Denoising Diffusion Probabilistic Model###################################
732
+ class DDPMSampler:
733
+ def __init__(self, generator: torch.Generator, num_training_steps=1000, beta_start: float = 0.00085, beta_end: float = 0.0120):
734
+ self.betas = torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_training_steps, dtype=torch.float32) ** 2
735
+ self.alphas = 1.0 - self.betas
736
+ self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
737
+ self.one = torch.tensor(1.0)
738
+ self.generator = generator
739
+ self.num_train_timesteps = num_training_steps
740
+ self.timesteps = torch.from_numpy(np.arange(0, num_training_steps)[::-1].copy())
741
+
742
+ def set_inference_timesteps(self, num_inference_steps=50):
743
+ self.num_inference_steps = num_inference_steps
744
+ step_ratio = self.num_train_timesteps // self.num_inference_steps
745
+ timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
746
+ self.timesteps = torch.from_numpy(timesteps)
747
+
748
+ def _get_previous_timestep(self, timestep: int) -> int:
749
+ prev_t = timestep - self.num_train_timesteps // self.num_inference_steps
750
+ return prev_t
751
+
752
+ def _get_variance(self, timestep: int) -> torch.Tensor:
753
+ prev_t = self._get_previous_timestep(timestep)
754
+ alpha_prod_t = self.alphas_cumprod[timestep]
755
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
756
+ current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev
757
+ variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
758
+ variance = torch.clamp(variance, min=1e-20)
759
+ return variance
760
+
761
+ def set_strength(self, strength=1):
762
+ """
763
+ Set how much noise to add to the input image.
764
+ More noise (strength ~ 1) means that the output will be further from the input image.
765
+ Less noise (strength ~ 0) means that the output will be closer to the input image.
766
+ """
767
+
768
+ start_step = self.num_inference_steps - int(self.num_inference_steps * strength)
769
+ self.timesteps = self.timesteps[start_step:]
770
+ self.start_step = start_step
771
+
772
+ def step(self, timestep: int, latents: torch.Tensor, model_output: torch.Tensor):
773
+ t = timestep
774
+ prev_t = self._get_previous_timestep(t)
775
+ alpha_prod_t = self.alphas_cumprod[t]
776
+ alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one
777
+ beta_prod_t = 1 - alpha_prod_t
778
+ beta_prod_t_prev = 1 - alpha_prod_t_prev
779
+ current_alpha_t = alpha_prod_t / alpha_prod_t_prev
780
+ current_beta_t = 1 - current_alpha_t
781
+ pred_original_sample = (latents - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5)
782
+ pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t
783
+ current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t
784
+ pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * latents
785
+ variance = 0
786
+ if t > 0:
787
+ device = model_output.device
788
+ noise = torch.randn(model_output.shape, generator=self.generator, device=device, dtype=model_output.dtype)
789
+
790
+ variance = (self._get_variance(t) ** 0.5) * noise
791
+ pred_prev_sample = pred_prev_sample + variance
792
+
793
+ return pred_prev_sample
794
+
795
+ def add_noise(
796
+ self,
797
+ original_samples: torch.FloatTensor,
798
+ timesteps: torch.IntTensor,
799
+ ) -> torch.FloatTensor:
800
+ alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype)
801
+ timesteps = timesteps.to(original_samples.device)
802
+
803
+ sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
804
+ sqrt_alpha_prod = sqrt_alpha_prod.flatten()
805
+ while len(sqrt_alpha_prod.shape) < len(original_samples.shape):
806
+ sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
807
+
808
+ sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
809
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
810
+ while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape):
811
+ sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
812
+ noise = torch.randn(original_samples.shape, generator=self.generator, device=original_samples.device, dtype=original_samples.dtype)
813
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
814
+ return noisy_samples
model_converter.py ADDED
The diff for this file is too large to render. See raw diff
 
model_loader.py ADDED
@@ -0,0 +1,28 @@
from nanograd.models.stable_diffusion.clip import CLIP
from nanograd.models.stable_diffusion.encoder import VAE_Encoder
from nanograd.models.stable_diffusion.decoder import VAE_Decoder
from nanograd.models.stable_diffusion.diffusion import Diffusion

from nanograd.models.stable_diffusion import model_converter

def preload_models_from_standard_weights(ckpt_path, device):
    state_dict = model_converter.load_from_standard_weights(ckpt_path, device)

    encoder = VAE_Encoder().to(device)
    encoder.load_state_dict(state_dict['encoder'], strict=True)

    decoder = VAE_Decoder().to(device)
    decoder.load_state_dict(state_dict['decoder'], strict=True)

    diffusion = Diffusion().to(device)
    diffusion.load_state_dict(state_dict['diffusion'], strict=True)

    clip = CLIP().to(device)
    clip.load_state_dict(state_dict['clip'], strict=True)

    return {
        'clip': clip,
        'encoder': encoder,
        'decoder': decoder,
        'diffusion': diffusion,
    }
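A sketch of loading a converted checkpoint with this helper; the checkpoint filename below is a placeholder, and model_converter.load_from_standard_weights is assumed to return the per-module state dicts used above.

import torch
from nanograd.models.stable_diffusion import model_loader

device = "cuda" if torch.cuda.is_available() else "cpu"
# "v1-5-pruned-emaonly.ckpt" is a placeholder path to a Stable Diffusion v1 checkpoint.
models = model_loader.preload_models_from_standard_weights("v1-5-pruned-emaonly.ckpt", device)
print(models.keys())  # dict_keys(['clip', 'encoder', 'decoder', 'diffusion'])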
pipeline.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import torch
+ import numpy as np
+ from tqdm import tqdm
+ from nanograd.models.stable_diffusion.ddpm import DDPMSampler
+
+ WIDTH = 512
+ HEIGHT = 512
+ LATENTS_WIDTH = WIDTH // 8
+ LATENTS_HEIGHT = HEIGHT // 8
+
+ def generate(
+     prompt,
+     uncond_prompt=None,
+     input_image=None,
+     strength=0.8,
+     do_cfg=True,
+     cfg_scale=7.5,
+     sampler_name="ddpm",
+     n_inference_steps=50,
+     models={},
+     seed=None,
+     device=None,
+     idle_device=None,
+     tokenizer=None,
+ ):
+     with torch.no_grad():
+         if not 0 < strength <= 1:
+             raise ValueError("strength must be between 0 and 1")
+
+         if idle_device:
+             to_idle = lambda x: x.to(idle_device)
+         else:
+             to_idle = lambda x: x
+
+         generator = torch.Generator(device=device)
+         if seed is None:
+             generator.seed()
+         else:
+             generator.manual_seed(seed)
+
+         clip = models["clip"]
+         clip.to(device)
+
+         if do_cfg:
+             cond_tokens = tokenizer.batch_encode_plus(
+                 [prompt], padding="max_length", max_length=77
+             ).input_ids
+             cond_tokens = torch.tensor(cond_tokens, dtype=torch.long, device=device)
+             cond_context = clip(cond_tokens)
+             uncond_tokens = tokenizer.batch_encode_plus(
+                 [uncond_prompt], padding="max_length", max_length=77
+             ).input_ids
+             uncond_tokens = torch.tensor(uncond_tokens, dtype=torch.long, device=device)
+             uncond_context = clip(uncond_tokens)
+             context = torch.cat([cond_context, uncond_context])
+         else:
+             tokens = tokenizer.batch_encode_plus(
+                 [prompt], padding="max_length", max_length=77
+             ).input_ids
+             tokens = torch.tensor(tokens, dtype=torch.long, device=device)
+             context = clip(tokens)
+         to_idle(clip)
+
+         if sampler_name == "ddpm":
+             sampler = DDPMSampler(generator)
+             sampler.set_inference_timesteps(n_inference_steps)
+         else:
+             raise ValueError(f"Unknown sampler value {sampler_name}.")
+
+         latents_shape = (1, 4, LATENTS_HEIGHT, LATENTS_WIDTH)
+
+         if input_image:
+             encoder = models["encoder"]
+             encoder.to(device)
+
+             input_image_tensor = input_image.resize((WIDTH, HEIGHT))
+             input_image_tensor = np.array(input_image_tensor)
+             input_image_tensor = torch.tensor(input_image_tensor, dtype=torch.float32, device=device)
+             input_image_tensor = rescale(input_image_tensor, (0, 255), (-1, 1))
+             input_image_tensor = input_image_tensor.unsqueeze(0)
+             input_image_tensor = input_image_tensor.permute(0, 3, 1, 2)
+
+             encoder_noise = torch.randn(latents_shape, generator=generator, device=device)
+             latents = encoder(input_image_tensor, encoder_noise)
+
+             sampler.set_strength(strength=strength)
+             latents = sampler.add_noise(latents, sampler.timesteps[0])
+
+             to_idle(encoder)
+         else:
+             latents = torch.randn(latents_shape, generator=generator, device=device)
+
+         diffusion = models["diffusion"]
+         diffusion.to(device)
+
+         timesteps = tqdm(sampler.timesteps)
+         for i, timestep in enumerate(timesteps):
+             time_embedding = get_time_embedding(timestep).to(device)
+
+             model_input = latents
+
+             if do_cfg:
+                 model_input = model_input.repeat(2, 1, 1, 1)
+
+             model_output = diffusion(model_input, context, time_embedding)
+
+             if do_cfg:
+                 output_cond, output_uncond = model_output.chunk(2)
+                 model_output = cfg_scale * (output_cond - output_uncond) + output_uncond
+
+             latents = sampler.step(timestep, latents, model_output)
+
+         to_idle(diffusion)
+
+         decoder = models["decoder"]
+         decoder.to(device)
+         images = decoder(latents)
+         to_idle(decoder)
+
+         images = rescale(images, (-1, 1), (0, 255), clamp=True)
+         images = images.permute(0, 2, 3, 1)
+         images = images.to("cpu", torch.uint8).numpy()
+         return images[0]
+
+ def rescale(x, old_range, new_range, clamp=False):
+     old_min, old_max = old_range
+     new_min, new_max = new_range
+     x -= old_min
+     x *= (new_max - new_min) / (old_max - old_min)
+     x += new_min
+     if clamp:
+         x = x.clamp(new_min, new_max)
+     return x
+
+ def get_time_embedding(timestep):
+     # Shape: (160,)
+     freqs = torch.pow(10000, -torch.arange(start=0, end=160, dtype=torch.float32) / 160)
+     # Shape: (1, 160)
+     x = torch.tensor([timestep], dtype=torch.float32)[:, None] * freqs[None]
+     # Shape: (1, 160 * 2)
+     return torch.cat([torch.cos(x), torch.sin(x)], dim=-1)
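get_time_embedding is the usual sinusoidal timestep embedding: 160 frequencies omega_i = 10000^(-i/160), with the cosine and sine halves concatenated, so the diffusion model receives a (1, 320) vector per denoising step. A quick shape check against the function above:

    emb = get_time_embedding(999)
    print(emb.shape)  # torch.Size([1, 320])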
sd_gradio.py ADDED
@@ -0,0 +1,69 @@
+ import gradio as gr
+ from PIL import Image
+ from pathlib import Path
+ from transformers import CLIPTokenizer
+ import torch
+ from nanograd.models.stable_diffusion import model_loader, pipeline
+
+ DEVICE = "cpu"
+ ALLOW_CUDA = False
+ ALLOW_MPS = False
+
+ if torch.cuda.is_available() and ALLOW_CUDA:
+     DEVICE = "cuda"
+ elif torch.backends.mps.is_available() and ALLOW_MPS:
+     DEVICE = "mps"
+ print(f"Using device: {DEVICE}")
+
+ tokenizer_vocab_path = Path("C:\\Users\\Esmail\\Desktop\\nanograd\\nanograd\\models\\stable_diffusion\\sd_data\\tokenizer_vocab.json")
+ tokenizer_merges_path = Path("C:\\Users\\Esmail\\Desktop\\nanograd\\nanograd\\models\\stable_diffusion\\sd_data\\tokenizer_merges.txt")
+ model_file = Path("C:\\Users\\Esmail\\Desktop\\nanograd\\nanograd\\models\\stable_diffusion\\sd_data\\v1-5-pruned-emaonly.ckpt")
+
+ tokenizer = CLIPTokenizer(str(tokenizer_vocab_path), merges_file=str(tokenizer_merges_path))
+ models = model_loader.preload_models_from_standard_weights(str(model_file), DEVICE)
+
+ def generate_image(prompt, cfg_scale, num_inference_steps, sampler):
+     uncond_prompt = ""
+     do_cfg = True
+     input_image = None
+     strength = 0.9
+     seed = 42
+
+     output_image = pipeline.generate(
+         prompt=prompt,
+         uncond_prompt=uncond_prompt,
+         input_image=input_image,
+         strength=strength,
+         do_cfg=do_cfg,
+         cfg_scale=cfg_scale,
+         sampler_name=sampler,
+         n_inference_steps=num_inference_steps,
+         seed=seed,
+         models=models,
+         device=DEVICE,
+         idle_device="cpu",
+         tokenizer=tokenizer,
+     )
+
+     output_image = Image.fromarray(output_image)
+     return output_image
+
+ # Gradio interface
+ def gradio_interface():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             with gr.Column(scale=2):
+                 prompt_input = gr.Textbox(label="Prompt", placeholder="A cat stretching on the floor, highly detailed, ultra sharp, cinematic, 100mm lens, 8k resolution")
+                 cfg_scale = gr.Slider(label="CFG Scale", minimum=1, maximum=20, value=7, step=1)
+                 num_inference_steps = gr.Slider(label="Sampling Steps", minimum=10, maximum=100, value=20, step=5)
+                 sampler = gr.Radio(label="Sampling Method", choices=["ddpm", "Euler a", "Euler", "LMS", "Heun", "DPM2 a", "PLMS"], value="ddpm")  # note: only "ddpm" is implemented in pipeline.generate
+                 generate_btn = gr.Button("Generate", variant="primary")
+             with gr.Column(scale=2):
+                 output_image = gr.Image(label="Output", show_label=False, height=512, width=512)
+
+         generate_btn.click(fn=generate_image, inputs=[prompt_input, cfg_scale, num_inference_steps, sampler], outputs=output_image)
+
+     demo.launch()
+
+ if __name__ == "__main__":
+     gradio_interface()
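The script launches the UI locally by default. If the demo needs to be reachable from another machine, Gradio's standard launch options can be passed instead (a sketch, not part of this file):

    # inside gradio_interface(), replacing the plain launch call
    demo.launch(server_name="0.0.0.0", share=True)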
sd_inference.py ADDED
@@ -0,0 +1,72 @@
+ from nanograd.models.stable_diffusion import model_loader
+ from nanograd.models.stable_diffusion import pipeline
+
+ from PIL import Image
+ from pathlib import Path
+ from transformers import CLIPTokenizer
+ import torch
+
+ DEVICE = "cpu"
+
+ ALLOW_CUDA = False
+ ALLOW_MPS = False
+
+ if torch.cuda.is_available() and ALLOW_CUDA:
+     DEVICE = "cuda"
+ elif torch.backends.mps.is_available() and ALLOW_MPS:
+     DEVICE = "mps"
+ print(f"Using device: {DEVICE}")
+
+ tokenizer = CLIPTokenizer(r"nanograd\models\stable_diffusion\sd_data\tokenizer_vocab.json", merges_file=r"nanograd\models\stable_diffusion\sd_data\tokenizer_merges.txt")
+ model_file = r"nanograd\models\stable_diffusion\sd_data\v1-5-pruned-emaonly.ckpt"
+ models = model_loader.preload_models_from_standard_weights(model_file, DEVICE)
+
+ ## TEXT TO IMAGE
+
+ prompt = input("Enter your prompt: ")
+ # prompt = "A cat stretching on the floor, highly detailed, ultra sharp, cinematic, 100mm lens, 8k resolution."
+ uncond_prompt = ""
+ do_cfg = True
+ cfg_scale = 8  # min: 1, max: 14
+
+ ## IMAGE TO IMAGE
+
+ input_image = None
+ # Uncomment to enable image-to-image
+ # image_path = "../images/dog.jpg"
+ # input_image = Image.open(image_path)
+ # Higher values mean more noise is added to the input image, so the result will be further from the input image.
+ strength = 0.9
+
+ ## SAMPLER
+
+ sampler = "ddpm"
+ num_inference_steps = 50
+ seed = 42
+
+
+ def run():
+     output_image = pipeline.generate(
+         prompt=prompt,
+         uncond_prompt=uncond_prompt,
+         input_image=input_image,
+         strength=strength,
+         do_cfg=do_cfg,
+         cfg_scale=cfg_scale,
+         sampler_name=sampler,
+         n_inference_steps=num_inference_steps,
+         seed=seed,
+         models=models,
+         device=DEVICE,
+         idle_device="cpu",
+         tokenizer=tokenizer,
+     )
+
+     output_image = Image.fromarray(output_image)
+     output_path = r"nanograd\models\stable_diffusion\output\c.png"
+     output_image.save(output_path)
+     print(f"Image saved as {output_path}")
+
+ if __name__ == "__main__":
+     run()
+