Songwei Ge commited on
Commit
4c022fe
1 Parent(s): 14a857e
app.py CHANGED
@@ -1,10 +1,228 @@
1
  import gradio as gr
2
3
 
4
- HTML = "<!-- Include stylesheet -->\n<link href=\"https://cdn.quilljs.com/1.3.6/quill.snow.css\" rel=\"stylesheet\">\n\n<!-- Create the editor container -->\n<div id=\"editor\">\n <p>Hello World!</p>\n <p>Some initial <strong>bold</strong> text</p>\n <p><br></p>\n</div>\n\n<!-- Include the Quill library -->\n<script src=\"https://cdn.quilljs.com/1.3.6/quill.js\"></script>\n\n<!-- Initialize Quill editor -->\n<script>\n var quill = new Quill('#editor', {\n theme: 'snow'\n });\n</script>"
 
5
 
6
- def greet(name):
7
- return HTML, "Hello " + name + "!!"
8
 
9
- iface = gr.Interface(greet, gr.Textbox(placeholder="Enter sentence here..."), ["html", "text"])
10
- iface.launch()
 
1
+ import math
2
+ import random
3
+ import os
4
+ import json
5
+ import time
6
+ import argparse
7
+ import imageio
8
+ import torch
9
+ import numpy as np
10
+ from torchvision import transforms
11
+
12
+ from models.region_diffusion import RegionDiffusion
13
+ from utils.attention_utils import get_token_maps
14
+ from utils.richtext_utils import seed_everything, parse_json, get_region_diffusion_input,\
15
+ get_attention_control_input, get_gradient_guidance_input
16
+
17
+
18
  import gradio as gr
19
+ from PIL import Image, ImageOps
20
+
21
+
22
+ help_text = """
23
+ Instructions placeholder.
24
+ """
25
+
26
+
27
+ example_instructions = [
28
+ "Make it a picasso painting",
29
+ "as if it were by modigliani",
30
+ "convert to a bronze statue",
31
+ "Turn it into an anime.",
32
+ "have it look like a graphic novel",
33
+ "make him gain weight",
34
+ "what would he look like bald?",
35
+ "Have him smile",
36
+ "Put him in a cocktail party.",
37
+ "move him at the beach.",
38
+ "add dramatic lighting",
39
+ "Convert to black and white",
40
+ "What if it were snowing?",
41
+ "Give him a leather jacket",
42
+ "Turn him into a cyborg!",
43
+ "make him wear a beanie",
44
+ ]
45
+
46
+ def main():
47
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
48
+ model = RegionDiffusion(device)
49
+
50
+ def generate(
51
+ text_input: str,
52
+ negative_text: str,
53
+ height: int,
54
+ width: int,
55
+ seed: int,
56
+ steps: int,
57
+ guidance_weight: float,
58
+ ):
59
+ run_dir = 'results/'
60
+ # Load region diffusion model.
61
+ steps = 41 if not steps else steps
62
+ guidance_weight = 8.5 if not guidance_weight else guidance_weight
63
+
64
+ # parse json to span attributes
65
+ base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
66
+ color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance = parse_json(
67
+ text_input)
68
+
69
+ # create control input for region diffusion
70
+ region_text_prompts, region_target_token_ids, base_tokens = get_region_diffusion_input(
71
+ model, base_text_prompt, style_text_prompts, footnote_text_prompts,
72
+ footnote_target_tokens, color_text_prompts, color_names)
73
+
74
+ # create control input for cross attention
75
+ text_format_dict = get_attention_control_input(
76
+ model, base_tokens, size_text_prompts_and_sizes)
77
+
78
+ # create control input for region guidance
79
+ text_format_dict, color_target_token_ids = get_gradient_guidance_input(
80
+ model, base_tokens, color_text_prompts, color_rgbs, text_format_dict)
81
+
82
+ seed_everything(seed)
83
+
84
+ # get token maps from plain text to image generation.
85
+ begin_time = time.time()
86
+ if model.attention_maps is None:
87
+ model.register_evaluation_hooks()
88
+ else:
89
+ model.reset_attention_maps()
90
+ plain_img = model.produce_attn_maps([base_text_prompt], [negative_text],
91
+ height=height, width=width, num_inference_steps=steps,
92
+ guidance_scale=guidance_weight)
93
+ print('time elapsed to get attention maps: %.4f' % (time.time()-begin_time))
94
+ color_obj_masks = get_token_maps(
95
+ model.attention_maps, run_dir, width//8, height//8, color_target_token_ids, seed)
96
+ model.masks = get_token_maps(
97
+ model.attention_maps, run_dir, width//8, height//8, region_target_token_ids, seed, base_tokens)
98
+ color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
99
+ interpolation=transforms.InterpolationMode.BICUBIC,
100
+ antialias=True)
101
+ for color_obj_mask in color_obj_masks]
102
+ text_format_dict['color_obj_atten'] = color_obj_masks
103
+ model.remove_evaluation_hooks()
104
+
105
+ # generate image from rich text
106
+ begin_time = time.time()
107
+ seed_everything(seed)
108
+ rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
109
+ height=height, width=width, num_inference_steps=steps,
110
+ guidance_scale=guidance_weight, use_grad_guidance=use_grad_guidance,
111
+ text_format_dict=text_format_dict)
112
+ print('time elapsed to generate image from rich text: %.4f' %
113
+ (time.time()-begin_time))
114
+ return [plain_img[0], rich_img[0]]
115
+
116
+ with gr.Blocks() as demo:
117
+ gr.HTML("""<h1 style="font-weight: 900; margin-bottom: 7px;">Expressive Text-to-Image Generation with Rich Text</h1>
118
+ <p> Visit our <a href="https://rich-text-to-image.github.io/rich-text-to-json.html">rich-text-to-json interface</a> to generate rich-text JSON input.</p>""")
119
+ with gr.Row():
120
+ with gr.Column():
121
+ text_input = gr.Textbox(
122
+ label='Rich-text JSON Input',
123
+ max_lines=1,
124
+ placeholder='Example: \'{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#b26b00"},"insert":"church"},{"insert":" in the sunset with a beautiful landscape in the background.\n"}]}\'')
125
+ negative_prompt = gr.Textbox(
126
+ label='Negative Prompt',
127
+ max_lines=1,
128
+ placeholder='')
129
+ seed = gr.Slider(label='Seed',
130
+ minimum=0,
131
+ maximum=100000,
132
+ step=1,
133
+ value=6)
134
+ with gr.Accordion('Other Parameters', open=False):
135
+ steps = gr.Slider(label='Number of Steps',
136
+ minimum=0,
137
+ maximum=500,
138
+ step=1,
139
+ value=41)
140
+ guidance_weight = gr.Slider(label='CFG weight',
141
+ minimum=0,
142
+ maximum=50,
143
+ step=0.1,
144
+ value=8.5)
145
+ width = gr.Dropdown(choices=[512, 768, 896],
146
+ value=512,
147
+ label='Width',
148
+ visible=True)
149
+ height = gr.Dropdown(choices=[512, 768, 896],
150
+ value=512,
151
+ label='Height',
152
+ visible=True)
153
+
154
+ with gr.Row():
155
+ with gr.Column(scale=1, min_width=100):
156
+ generate_button = gr.Button("Generate")
157
+
158
+ with gr.Column():
159
+ result = gr.Image(label='Result')
160
+ token_map = gr.Image(label='TokenMap')
161
+
162
+ with gr.Row():
163
+ examples = [
164
+ [
165
+ '{"ops":[{"insert":"a Gothic "},{"attributes":{"color":"#b26b00"},"insert":"church"},{"insert":" in the sunset with a beautiful landscape in the background.\n"}]}',
166
+ '',
167
+ 512,
168
+ 512,
169
+ 6,
170
+ ],
171
+ [
172
+ '{"ops": [{"insert": "A pizza with "}, {"attributes": {"size": "50px"}, "insert": "pineapples"}, {"insert": ", pepperonis, and mushrooms on the top, 4k, photorealistic\n"}]}',
173
+ 'blurry, art, painting, rendering, drawing, sketch, ugly, duplicate, morbid, mutilated, mutated, deformed, disfigured low quality, worst quality',
174
+ 768,
175
+ 896,
176
+ 6,
177
+ ],
178
+ [
179
+ '{"ops":[{"insert":"a "},{"attributes":{"font":"mirza"},"insert":"beautiful garden"},{"insert":" with a "},{"attributes":{"font":"roboto"},"insert":"snow mountain in the background"},{"insert":"\n"}]}',
180
+ '',
181
+ 512,
182
+ 512,
183
+ 3,
184
+ ],
185
+ [
186
+ '{"ops":[{"insert":"A close-up 4k dslr photo of a "},{"attributes":{"link":"A cat wearing sunglasses and a bandana around its neck."},"insert":"cat"},{"insert":" riding a scooter. Palm trees in the background.\n"}]}',
187
+ '',
188
+ 512,
189
+ 512,
190
+ 6,
191
+ ],
192
+ ]
193
+ gr.Examples(examples=examples,
194
+ inputs=[
195
+ text_input,
196
+ negative_prompt,
197
+ height,
198
+ width,
199
+ seed,
200
+ ],
201
+ outputs=[
202
+ result,
203
+ token_map,
204
+ ],
205
+ fn=generate,
206
+ # cache_examples=True,
207
+ examples_per_page=20)
208
 
209
+ generate_button.click(
210
+ fn=generate,
211
+ inputs=[
212
+ text_input,
213
+ negative_prompt,
214
+ height,
215
+ width,
216
+ seed,
217
+ steps,
218
+ guidance_weight,
219
+ ],
220
+ outputs=[result, token_map],
221
+ )
222
 
223
+ demo.queue(concurrency_count=1)
224
+ demo.launch(share=False)
225
 
226
 
227
+ if __name__ == "__main__":
228
+ main()
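# --- Illustrative sketch (aside, not part of app.py): the rich-text JSON consumed by
# `generate` above is a Quill Delta object, i.e. a list of `ops` where each op has an
# `insert` string and optional `attributes` (the examples above use `color`, `size`,
# `font`, and `link`). The parsing below is only a toy stand-in for
# utils.richtext_utils.parse_json, which additionally extracts styles, sizes and footnotes.
import json

rich_text = {
    "ops": [
        {"insert": "a Gothic "},
        {"attributes": {"color": "#b26b00"}, "insert": "church"},
        {"insert": " in the sunset with a beautiful landscape in the background.\n"},
    ]
}
text_input = json.dumps(rich_text)            # what the Gradio textbox receives

delta = json.loads(text_input)
base_prompt = "".join(op["insert"] for op in delta["ops"])
colored_spans = [(op["insert"], op["attributes"]["color"])
                 for op in delta["ops"] if "color" in op.get("attributes", {})]
print(base_prompt.strip())                    # plain-text prompt
print(colored_spans)                          # [('church', '#b26b00')]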
models/attention.py ADDED
@@ -0,0 +1,892 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ import warnings
16
+ from dataclasses import dataclass
17
+ from typing import Optional
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ from torch import nn
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.models.modeling_utils import ModelMixin
25
+ from diffusers.models.embeddings import ImagePositionalEmbeddings
26
+ from diffusers.utils import BaseOutput
27
+ from diffusers.utils.import_utils import is_xformers_available
28
+
29
+
30
+ @dataclass
31
+ class Transformer2DModelOutput(BaseOutput):
32
+ """
33
+ Args:
34
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
35
+ Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
36
+ for the unnoised latent pixels.
37
+ """
38
+
39
+ sample: torch.FloatTensor
40
+
41
+
42
+ if is_xformers_available():
43
+ import xformers
44
+ import xformers.ops
45
+ else:
46
+ xformers = None
47
+
48
+
49
+ class Transformer2DModel(ModelMixin, ConfigMixin):
50
+ """
51
+ Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
52
+ embeddings) inputs.
53
+
54
+ When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
55
+ transformer action. Finally, reshape to image.
56
+
57
+ When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
58
+ embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
59
+ classes of unnoised image.
60
+
61
+ Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
62
+ image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
63
+
64
+ Parameters:
65
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
66
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
67
+ in_channels (`int`, *optional*):
68
+ Pass if the input is continuous. The number of channels in the input and output.
69
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
70
+ dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
71
+ cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
72
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
73
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
74
+ `ImagePositionalEmbeddings`.
75
+ num_vector_embeds (`int`, *optional*):
76
+ Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
77
+ Includes the class for the masked latent pixel.
78
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
79
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
80
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used
81
+ to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
82
+ up to but not more steps than `num_embeds_ada_norm`.
83
+ attention_bias (`bool`, *optional*):
84
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
85
+ """
86
+
87
+ @register_to_config
88
+ def __init__(
89
+ self,
90
+ num_attention_heads: int = 16,
91
+ attention_head_dim: int = 88,
92
+ in_channels: Optional[int] = None,
93
+ num_layers: int = 1,
94
+ dropout: float = 0.0,
95
+ norm_num_groups: int = 32,
96
+ cross_attention_dim: Optional[int] = None,
97
+ attention_bias: bool = False,
98
+ sample_size: Optional[int] = None,
99
+ num_vector_embeds: Optional[int] = None,
100
+ activation_fn: str = "geglu",
101
+ num_embeds_ada_norm: Optional[int] = None,
102
+ use_linear_projection: bool = False,
103
+ only_cross_attention: bool = False,
104
+ ):
105
+ super().__init__()
106
+ self.use_linear_projection = use_linear_projection
107
+ self.num_attention_heads = num_attention_heads
108
+ self.attention_head_dim = attention_head_dim
109
+ inner_dim = num_attention_heads * attention_head_dim
110
+
111
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
112
+ # Define whether input is continuous or discrete depending on configuration
113
+ self.is_input_continuous = in_channels is not None
114
+ self.is_input_vectorized = num_vector_embeds is not None
115
+
116
+ if self.is_input_continuous and self.is_input_vectorized:
117
+ raise ValueError(
118
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
119
+ " sure that either `in_channels` or `num_vector_embeds` is None."
120
+ )
121
+ elif not self.is_input_continuous and not self.is_input_vectorized:
122
+ raise ValueError(
123
+ f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make"
124
+ " sure that either `in_channels` or `num_vector_embeds` is not None."
125
+ )
126
+
127
+ # 2. Define input layers
128
+ if self.is_input_continuous:
129
+ self.in_channels = in_channels
130
+
131
+ self.norm = torch.nn.GroupNorm(
132
+ num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
133
+ if use_linear_projection:
134
+ self.proj_in = nn.Linear(in_channels, inner_dim)
135
+ else:
136
+ self.proj_in = nn.Conv2d(
137
+ in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
138
+ elif self.is_input_vectorized:
139
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
140
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
141
+
142
+ self.height = sample_size
143
+ self.width = sample_size
144
+ self.num_vector_embeds = num_vector_embeds
145
+ self.num_latent_pixels = self.height * self.width
146
+
147
+ self.latent_image_embedding = ImagePositionalEmbeddings(
148
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
149
+ )
150
+
151
+ # 3. Define transformers blocks
152
+ self.transformer_blocks = nn.ModuleList(
153
+ [
154
+ BasicTransformerBlock(
155
+ inner_dim,
156
+ num_attention_heads,
157
+ attention_head_dim,
158
+ dropout=dropout,
159
+ cross_attention_dim=cross_attention_dim,
160
+ activation_fn=activation_fn,
161
+ num_embeds_ada_norm=num_embeds_ada_norm,
162
+ attention_bias=attention_bias,
163
+ only_cross_attention=only_cross_attention,
164
+ )
165
+ for d in range(num_layers)
166
+ ]
167
+ )
168
+
169
+ # 4. Define output layers
170
+ if self.is_input_continuous:
171
+ if use_linear_projection:
172
+ self.proj_out = nn.Linear(in_channels, inner_dim)
173
+ else:
174
+ self.proj_out = nn.Conv2d(
175
+ inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
176
+ elif self.is_input_vectorized:
177
+ self.norm_out = nn.LayerNorm(inner_dim)
178
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
179
+
180
+ def _set_attention_slice(self, slice_size):
181
+ for block in self.transformer_blocks:
182
+ block._set_attention_slice(slice_size)
183
+
184
+ def forward(self, hidden_states, encoder_hidden_states=None, timestep=None,
185
+ text_format_dict={}, return_dict: bool = True):
186
+ """
187
+ Args:
188
+ hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
189
+ When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
190
+ hidden_states
191
+ encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
192
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
193
+ self-attention.
194
+ timestep ( `torch.long`, *optional*):
195
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
196
+ return_dict (`bool`, *optional*, defaults to `True`):
197
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
198
+
199
+ Returns:
200
+ [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
201
+ if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
202
+ tensor.
203
+ """
204
+ # 1. Input
205
+ if self.is_input_continuous:
206
+ batch, channel, height, weight = hidden_states.shape
207
+ residual = hidden_states
208
+
209
+ hidden_states = self.norm(hidden_states)
210
+ if not self.use_linear_projection:
211
+ hidden_states = self.proj_in(hidden_states)
212
+ inner_dim = hidden_states.shape[1]
213
+ hidden_states = hidden_states.permute(
214
+ 0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
215
+ else:
216
+ inner_dim = hidden_states.shape[1]
217
+ hidden_states = hidden_states.permute(
218
+ 0, 2, 3, 1).reshape(batch, height * weight, inner_dim)
219
+ hidden_states = self.proj_in(hidden_states)
220
+ elif self.is_input_vectorized:
221
+ hidden_states = self.latent_image_embedding(hidden_states)
222
+
223
+ # 2. Blocks
224
+ for block in self.transformer_blocks:
225
+ hidden_states = block(hidden_states, context=encoder_hidden_states, timestep=timestep,
226
+ text_format_dict=text_format_dict)
227
+
228
+ # 3. Output
229
+ if self.is_input_continuous:
230
+ if not self.use_linear_projection:
231
+ hidden_states = (
232
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(
233
+ 0, 3, 1, 2).contiguous()
234
+ )
235
+ hidden_states = self.proj_out(hidden_states)
236
+ else:
237
+ hidden_states = self.proj_out(hidden_states)
238
+ hidden_states = (
239
+ hidden_states.reshape(batch, height, weight, inner_dim).permute(
240
+ 0, 3, 1, 2).contiguous()
241
+ )
242
+
243
+ output = hidden_states + residual
244
+ elif self.is_input_vectorized:
245
+ hidden_states = self.norm_out(hidden_states)
246
+ logits = self.out(hidden_states)
247
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
248
+ logits = logits.permute(0, 2, 1)
249
+
250
+ # log(p(x_0))
251
+ output = F.log_softmax(logits.double(), dim=1).float()
252
+
253
+ if not return_dict:
254
+ return (output,)
255
+
256
+ return Transformer2DModelOutput(sample=output)
257
+
258
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
259
+ for block in self.transformer_blocks:
260
+ block._set_use_memory_efficient_attention_xformers(
261
+ use_memory_efficient_attention_xformers)
262
+
263
+
264
+ class AttentionBlock(nn.Module):
265
+ """
266
+ An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
267
+ to the N-d case.
268
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
269
+ Uses three q, k, v linear layers to compute attention.
270
+
271
+ Parameters:
272
+ channels (`int`): The number of channels in the input and output.
273
+ num_head_channels (`int`, *optional*):
274
+ The number of channels in each head. If None, then `num_heads` = 1.
275
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for group norm.
276
+ rescale_output_factor (`float`, *optional*, defaults to 1.0): The factor to rescale the output by.
277
+ eps (`float`, *optional*, defaults to 1e-5): The epsilon value to use for group norm.
278
+ """
279
+
280
+ def __init__(
281
+ self,
282
+ channels: int,
283
+ num_head_channels: Optional[int] = None,
284
+ norm_num_groups: int = 32,
285
+ rescale_output_factor: float = 1.0,
286
+ eps: float = 1e-5,
287
+ ):
288
+ super().__init__()
289
+ self.channels = channels
290
+
291
+ self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
292
+ self.num_head_size = num_head_channels
293
+ self.group_norm = nn.GroupNorm(
294
+ num_channels=channels, num_groups=norm_num_groups, eps=eps, affine=True)
295
+
296
+ # define q,k,v as linear layers
297
+ self.query = nn.Linear(channels, channels)
298
+ self.key = nn.Linear(channels, channels)
299
+ self.value = nn.Linear(channels, channels)
300
+
301
+ self.rescale_output_factor = rescale_output_factor
302
+ self.proj_attn = nn.Linear(channels, channels, 1)
303
+
304
+ def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
305
+ new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
306
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
307
+ new_projection = projection.view(
308
+ new_projection_shape).permute(0, 2, 1, 3)
309
+ return new_projection
310
+
311
+ def forward(self, hidden_states):
312
+ residual = hidden_states
313
+ batch, channel, height, width = hidden_states.shape
314
+
315
+ # norm
316
+ hidden_states = self.group_norm(hidden_states)
317
+
318
+ hidden_states = hidden_states.view(
319
+ batch, channel, height * width).transpose(1, 2)
320
+
321
+ # proj to q, k, v
322
+ query_proj = self.query(hidden_states)
323
+ key_proj = self.key(hidden_states)
324
+ value_proj = self.value(hidden_states)
325
+
326
+ scale = 1 / math.sqrt(self.channels / self.num_heads)
327
+
328
+ # get scores
329
+ if self.num_heads > 1:
330
+ query_states = self.transpose_for_scores(query_proj)
331
+ key_states = self.transpose_for_scores(key_proj)
332
+ value_states = self.transpose_for_scores(value_proj)
333
+
334
+ # TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
335
+ # or reformulate this into a 3D problem?
336
+ # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
337
+ # as some matmuls can be 1.94x slower than an equivalent einsum on MPS
338
+ # https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
339
+ attention_scores = torch.matmul(
340
+ query_states, key_states.transpose(-1, -2)) * scale
341
+ else:
342
+ query_states, key_states, value_states = query_proj, key_proj, value_proj
343
+
344
+ attention_scores = torch.baddbmm(
345
+ torch.empty(
346
+ query_states.shape[0],
347
+ query_states.shape[1],
348
+ key_states.shape[1],
349
+ dtype=query_states.dtype,
350
+ device=query_states.device,
351
+ ),
352
+ query_states,
353
+ key_states.transpose(-1, -2),
354
+ beta=0,
355
+ alpha=scale,
356
+ )
357
+
358
+ attention_probs = torch.softmax(
359
+ attention_scores.float(), dim=-1).type(attention_scores.dtype)
360
+
361
+ # compute attention output
362
+ if self.num_heads > 1:
363
+ # TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
364
+ # or reformulate this into a 3D problem?
365
+ # TODO: measure whether on MPS device it would be faster to do this matmul via einsum
366
+ # as some matmuls can be 1.94x slower than an equivalent einsum on MPS
367
+ # https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
368
+ hidden_states = torch.matmul(attention_probs, value_states)
369
+ hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
370
+ new_hidden_states_shape = hidden_states.size()[
371
+ :-2] + (self.channels,)
372
+ hidden_states = hidden_states.view(new_hidden_states_shape)
373
+ else:
374
+ hidden_states = torch.bmm(attention_probs, value_states)
375
+
376
+ # compute next hidden_states
377
+ hidden_states = self.proj_attn(hidden_states)
378
+ hidden_states = hidden_states.transpose(
379
+ -1, -2).reshape(batch, channel, height, width)
380
+
381
+ # res connect and rescale
382
+ hidden_states = (hidden_states + residual) / self.rescale_output_factor
383
+ return hidden_states
384
+
385
+
386
+ class BasicTransformerBlock(nn.Module):
387
+ r"""
388
+ A basic Transformer block.
389
+
390
+ Parameters:
391
+ dim (`int`): The number of channels in the input and output.
392
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
393
+ attention_head_dim (`int`): The number of channels in each head.
394
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
395
+ cross_attention_dim (`int`, *optional*): The size of the context vector for cross attention.
396
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
397
+ num_embeds_ada_norm (:
398
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
399
+ attention_bias (:
400
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
401
+ """
402
+
403
+ def __init__(
404
+ self,
405
+ dim: int,
406
+ num_attention_heads: int,
407
+ attention_head_dim: int,
408
+ dropout=0.0,
409
+ cross_attention_dim: Optional[int] = None,
410
+ activation_fn: str = "geglu",
411
+ num_embeds_ada_norm: Optional[int] = None,
412
+ attention_bias: bool = False,
413
+ only_cross_attention: bool = False,
414
+ ):
415
+ super().__init__()
416
+ self.only_cross_attention = only_cross_attention
417
+ self.attn1 = CrossAttention(
418
+ query_dim=dim,
419
+ heads=num_attention_heads,
420
+ dim_head=attention_head_dim,
421
+ dropout=dropout,
422
+ bias=attention_bias,
423
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
424
+ ) # is a self-attention
425
+ self.ff = FeedForward(dim, dropout=dropout,
426
+ activation_fn=activation_fn)
427
+ self.attn2 = CrossAttention(
428
+ query_dim=dim,
429
+ cross_attention_dim=cross_attention_dim,
430
+ heads=num_attention_heads,
431
+ dim_head=attention_head_dim,
432
+ dropout=dropout,
433
+ bias=attention_bias,
434
+ ) # is self-attn if context is none
435
+
436
+ # layer norms
437
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
438
+ if self.use_ada_layer_norm:
439
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
440
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
441
+ else:
442
+ self.norm1 = nn.LayerNorm(dim)
443
+ self.norm2 = nn.LayerNorm(dim)
444
+ self.norm3 = nn.LayerNorm(dim)
445
+
446
+ # if xformers is installed try to use memory_efficient_attention by default
447
+ if is_xformers_available():
448
+ try:
449
+ self._set_use_memory_efficient_attention_xformers(True)
450
+ except Exception as e:
451
+ warnings.warn(
452
+ "Could not enable memory efficient attention. Make sure xformers is installed"
453
+ f" correctly and a GPU is available: {e}"
454
+ )
455
+
456
+ def _set_attention_slice(self, slice_size):
457
+ self.attn1._slice_size = slice_size
458
+ self.attn2._slice_size = slice_size
459
+
460
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
461
+ if not is_xformers_available():
462
+ print("Here is how to install it")
463
+ raise ModuleNotFoundError(
464
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
465
+ " xformers",
466
+ name="xformers",
467
+ )
468
+ elif not torch.cuda.is_available():
469
+ raise ValueError(
470
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
471
+ " available for GPU "
472
+ )
473
+ else:
474
+ try:
475
+ # Make sure we can run the memory efficient attention
476
+ _ = xformers.ops.memory_efficient_attention(
477
+ torch.randn((1, 2, 40), device="cuda"),
478
+ torch.randn((1, 2, 40), device="cuda"),
479
+ torch.randn((1, 2, 40), device="cuda"),
480
+ )
481
+ except Exception as e:
482
+ raise e
483
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
484
+ self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
485
+
486
+ def forward(self, hidden_states, context=None, timestep=None, text_format_dict={}):
487
+ # 1. Self-Attention
488
+ norm_hidden_states = (
489
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(
490
+ hidden_states)
491
+ )
492
+
493
+ if self.only_cross_attention:
494
+ attn_out, _ = self.attn1(
495
+ norm_hidden_states, context, text_format_dict=text_format_dict)
496
+ hidden_states = attn_out + hidden_states
497
+ else:
498
+ attn_out, _ = self.attn1(norm_hidden_states)
499
+ hidden_states = attn_out + hidden_states
500
+
501
+ # 2. Cross-Attention
502
+ norm_hidden_states = (
503
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(
504
+ hidden_states)
505
+ )
506
+ attn_out, _ = self.attn2(
507
+ norm_hidden_states, context=context, text_format_dict=text_format_dict)
508
+ hidden_states = attn_out + hidden_states
509
+
510
+ # 3. Feed-forward
511
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
512
+
513
+ return hidden_states
514
+
515
+
516
+ class CrossAttention(nn.Module):
517
+ r"""
518
+ A cross attention layer.
519
+
520
+ Parameters:
521
+ query_dim (`int`): The number of channels in the query.
522
+ cross_attention_dim (`int`, *optional*):
523
+ The number of channels in the context. If not given, defaults to `query_dim`.
524
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
525
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
526
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
527
+ bias (`bool`, *optional*, defaults to False):
528
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
529
+ """
530
+
531
+ def __init__(
532
+ self,
533
+ query_dim: int,
534
+ cross_attention_dim: Optional[int] = None,
535
+ heads: int = 8,
536
+ dim_head: int = 64,
537
+ dropout: float = 0.0,
538
+ bias=False,
539
+ ):
540
+ super().__init__()
541
+ inner_dim = dim_head * heads
542
+ self.is_cross_attn = cross_attention_dim is not None
543
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
544
+
545
+ self.scale = dim_head**-0.5
546
+ self.heads = heads
547
+ # for slice_size > 0 the attention score computation
548
+ # is split across the batch axis to save memory
549
+ # You can set slice_size with `set_attention_slice`
550
+ self._slice_size = None
551
+ self._use_memory_efficient_attention_xformers = False
552
+
553
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
554
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
555
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
556
+
557
+ self.to_out = nn.ModuleList([])
558
+ self.to_out.append(nn.Linear(inner_dim, query_dim))
559
+ self.to_out.append(nn.Dropout(dropout))
560
+
561
+ def reshape_heads_to_batch_dim(self, tensor):
562
+ batch_size, seq_len, dim = tensor.shape
563
+ head_size = self.heads
564
+ tensor = tensor.reshape(batch_size, seq_len,
565
+ head_size, dim // head_size)
566
+ tensor = tensor.permute(0, 2, 1, 3).reshape(
567
+ batch_size * head_size, seq_len, dim // head_size)
568
+ return tensor
569
+
570
+ def reshape_batch_dim_to_heads(self, tensor):
571
+ batch_size, seq_len, dim = tensor.shape
572
+ head_size = self.heads
573
+ tensor = tensor.reshape(batch_size // head_size,
574
+ head_size, seq_len, dim)
575
+ tensor = tensor.permute(0, 2, 1, 3).reshape(
576
+ batch_size // head_size, seq_len, dim * head_size)
577
+ return tensor
578
+
579
+ def reshape_batch_dim_to_heads_and_average(self, tensor):
580
+ batch_size, seq_len, seq_len2 = tensor.shape
581
+ head_size = self.heads
582
+ tensor = tensor.reshape(batch_size // head_size,
583
+ head_size, seq_len, seq_len2)
584
+ return tensor.mean(1)
585
+
586
+ def forward(self, hidden_states, context=None, mask=None, text_format_dict={}):
587
+ batch_size, sequence_length, _ = hidden_states.shape
588
+
589
+ query = self.to_q(hidden_states)
590
+ context = context if context is not None else hidden_states
591
+ key = self.to_k(context)
592
+ value = self.to_v(context)
593
+
594
+ dim = query.shape[-1]
595
+
596
+ query = self.reshape_heads_to_batch_dim(query)
597
+ key = self.reshape_heads_to_batch_dim(key)
598
+ value = self.reshape_heads_to_batch_dim(value)
599
+
600
+ # attention, what we cannot get enough of
601
+ if self._use_memory_efficient_attention_xformers:
602
+ hidden_states = self._memory_efficient_attention_xformers(
603
+ query, key, value)
604
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
605
+ hidden_states = hidden_states.to(query.dtype)
606
+ else:
607
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
608
+ # only this attention function is used
609
+ hidden_states, attn_probs = self._attention(
610
+ query, key, value, **text_format_dict)
611
+
612
+ # linear proj
613
+ hidden_states = self.to_out[0](hidden_states)
614
+ # dropout
615
+ hidden_states = self.to_out[1](hidden_states)
616
+ return hidden_states, attn_probs
617
+
618
+ def _qk(self, query, key):
619
+ return torch.baddbmm(
620
+ torch.empty(query.shape[0], query.shape[1], key.shape[1],
621
+ dtype=query.dtype, device=query.device),
622
+ query,
623
+ key.transpose(-1, -2),
624
+ beta=0,
625
+ alpha=self.scale,
626
+ )
627
+
628
+ def _attention(self, query, key, value, word_pos=None, font_size=None,
629
+ **kwargs):
630
+ attention_scores = self._qk(query, key)
631
+
632
+ # Font size:
633
+ if self.is_cross_attn and word_pos is not None and font_size is not None:
634
+ assert key.shape[1] == 77
635
+ attention_score_exp = attention_scores.exp()
636
+ font_size_abs, font_size_sign = font_size.abs(), font_size.sign()
637
+ attention_score_exp[:, :, word_pos] = attention_score_exp[:, :, word_pos].clone(
638
+ )*font_size_abs
639
+ attention_probs = attention_score_exp / \
640
+ attention_score_exp.sum(-1, True)
641
+ attention_probs[:, :, word_pos] *= font_size_sign
642
+ else:
643
+ attention_probs = attention_scores.softmax(dim=-1)
644
+
645
+ hidden_states = torch.bmm(attention_probs, value)
646
+
647
+ # reshape hidden_states
648
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
649
+ attention_probs = self.reshape_batch_dim_to_heads_and_average(
650
+ attention_probs)
651
+ return hidden_states, attention_probs
652
+
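# --- Illustrative sketch (aside, not part of models/attention.py): the font-size branch
# of `_attention` above re-weights cross-attention toward the tokens in `word_pos` by
# scaling exp(score) at those key positions with |font_size| before renormalising, then
# re-applying the sign of `font_size`. A self-contained toy version:
import torch

def reweight_attention(scores, word_pos, font_size):
    # scores: (batch*heads, query_len, key_len); word_pos: LongTensor of key indices
    exp_scores = scores.exp()
    exp_scores[:, :, word_pos] = exp_scores[:, :, word_pos] * font_size.abs()
    probs = exp_scores / exp_scores.sum(-1, keepdim=True)
    probs[:, :, word_pos] = probs[:, :, word_pos] * font_size.sign()
    return probs

toy_probs = reweight_attention(torch.randn(2, 4, 77),        # 77 = CLIP context length
                               torch.tensor([5]), torch.tensor(2.0))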
653
+ def _memory_efficient_attention_xformers(self, query, key, value):
654
+ query = query.contiguous()
655
+ key = key.contiguous()
656
+ value = value.contiguous()
657
+ hidden_states = xformers.ops.memory_efficient_attention(
658
+ query, key, value, attn_bias=None)
659
+ hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
660
+ return hidden_states
661
+
662
+
663
+ class FeedForward(nn.Module):
664
+ r"""
665
+ A feed-forward layer.
666
+
667
+ Parameters:
668
+ dim (`int`): The number of channels in the input.
669
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
670
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
671
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
672
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
673
+ """
674
+
675
+ def __init__(
676
+ self,
677
+ dim: int,
678
+ dim_out: Optional[int] = None,
679
+ mult: int = 4,
680
+ dropout: float = 0.0,
681
+ activation_fn: str = "geglu",
682
+ ):
683
+ super().__init__()
684
+ inner_dim = int(dim * mult)
685
+ dim_out = dim_out if dim_out is not None else dim
686
+
687
+ if activation_fn == "geglu":
688
+ geglu = GEGLU(dim, inner_dim)
689
+ elif activation_fn == "geglu-approximate":
690
+ geglu = ApproximateGELU(dim, inner_dim)
691
+
692
+ self.net = nn.ModuleList([])
693
+ # project in
694
+ self.net.append(geglu)
695
+ # project dropout
696
+ self.net.append(nn.Dropout(dropout))
697
+ # project out
698
+ self.net.append(nn.Linear(inner_dim, dim_out))
699
+
700
+ def forward(self, hidden_states):
701
+ for module in self.net:
702
+ hidden_states = module(hidden_states)
703
+ return hidden_states
704
+
705
+
706
+ # feedforward
707
+ class GEGLU(nn.Module):
708
+ r"""
709
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
710
+
711
+ Parameters:
712
+ dim_in (`int`): The number of channels in the input.
713
+ dim_out (`int`): The number of channels in the output.
714
+ """
715
+
716
+ def __init__(self, dim_in: int, dim_out: int):
717
+ super().__init__()
718
+ self.proj = nn.Linear(dim_in, dim_out * 2)
719
+
720
+ def gelu(self, gate):
721
+ if gate.device.type != "mps":
722
+ return F.gelu(gate)
723
+ # mps: gelu is not implemented for float16
724
+ return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
725
+
726
+ def forward(self, hidden_states):
727
+ hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
728
+ return hidden_states * self.gelu(gate)
729
+
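# --- Illustrative sketch (aside): GEGLU above projects to twice the output width,
# splits the result into a value half and a gate half, and returns value * GELU(gate).
import torch
import torch.nn.functional as F

x = torch.randn(1, 16, 320)              # (batch, tokens, dim_in)
proj = torch.nn.Linear(320, 1280 * 2)    # dim_out = dim_in * 4, as in FeedForward
value, gate = proj(x).chunk(2, dim=-1)
out = value * F.gelu(gate)               # shape (1, 16, 1280)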
730
+
731
+ class ApproximateGELU(nn.Module):
732
+ """
733
+ The approximate form of Gaussian Error Linear Unit (GELU)
734
+
735
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
736
+ """
737
+
738
+ def __init__(self, dim_in: int, dim_out: int):
739
+ super().__init__()
740
+ self.proj = nn.Linear(dim_in, dim_out)
741
+
742
+ def forward(self, x):
743
+ x = self.proj(x)
744
+ return x * torch.sigmoid(1.702 * x)
745
+
746
+
747
+ class AdaLayerNorm(nn.Module):
748
+ """
749
+ Norm layer modified to incorporate timestep embeddings.
750
+ """
751
+
752
+ def __init__(self, embedding_dim, num_embeddings):
753
+ super().__init__()
754
+ self.emb = nn.Embedding(num_embeddings, embedding_dim)
755
+ self.silu = nn.SiLU()
756
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
757
+ self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
758
+
759
+ def forward(self, x, timestep):
760
+ emb = self.linear(self.silu(self.emb(timestep)))
761
+ scale, shift = torch.chunk(emb, 2)
762
+ x = self.norm(x) * (1 + scale) + shift
763
+ return x
764
+
765
+
766
+ class DualTransformer2DModel(nn.Module):
767
+ """
768
+ Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference.
769
+
770
+ Parameters:
771
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
772
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
773
+ in_channels (`int`, *optional*):
774
+ Pass if the input is continuous. The number of channels in the input and output.
775
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
776
+ dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use.
777
+ cross_attention_dim (`int`, *optional*): The number of context dimensions to use.
778
+ sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
779
+ Note that this is fixed at training time as it is used for learning a number of position embeddings. See
780
+ `ImagePositionalEmbeddings`.
781
+ num_vector_embeds (`int`, *optional*):
782
+ Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
783
+ Includes the class for the masked latent pixel.
784
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
785
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
786
+ The number of diffusion steps used during training. Note that this is fixed at training time as it is used
787
+ to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
788
+ up to but not more steps than `num_embeds_ada_norm`.
789
+ attention_bias (`bool`, *optional*):
790
+ Configure if the TransformerBlocks' attention should contain a bias parameter.
791
+ """
792
+
793
+ def __init__(
794
+ self,
795
+ num_attention_heads: int = 16,
796
+ attention_head_dim: int = 88,
797
+ in_channels: Optional[int] = None,
798
+ num_layers: int = 1,
799
+ dropout: float = 0.0,
800
+ norm_num_groups: int = 32,
801
+ cross_attention_dim: Optional[int] = None,
802
+ attention_bias: bool = False,
803
+ sample_size: Optional[int] = None,
804
+ num_vector_embeds: Optional[int] = None,
805
+ activation_fn: str = "geglu",
806
+ num_embeds_ada_norm: Optional[int] = None,
807
+ ):
808
+ super().__init__()
809
+ self.transformers = nn.ModuleList(
810
+ [
811
+ Transformer2DModel(
812
+ num_attention_heads=num_attention_heads,
813
+ attention_head_dim=attention_head_dim,
814
+ in_channels=in_channels,
815
+ num_layers=num_layers,
816
+ dropout=dropout,
817
+ norm_num_groups=norm_num_groups,
818
+ cross_attention_dim=cross_attention_dim,
819
+ attention_bias=attention_bias,
820
+ sample_size=sample_size,
821
+ num_vector_embeds=num_vector_embeds,
822
+ activation_fn=activation_fn,
823
+ num_embeds_ada_norm=num_embeds_ada_norm,
824
+ )
825
+ for _ in range(2)
826
+ ]
827
+ )
828
+
829
+ # Variables that can be set by a pipeline:
830
+
831
+ # The ratio of transformer1 to transformer2's output states to be combined during inference
832
+ self.mix_ratio = 0.5
833
+
834
+ # The shape of `encoder_hidden_states` is expected to be
835
+ # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)`
836
+ self.condition_lengths = [77, 257]
837
+
838
+ # Which transformer to use to encode which condition.
839
+ # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])`
840
+ self.transformer_index_for_condition = [1, 0]
841
+
842
+ def forward(self, hidden_states, encoder_hidden_states, timestep=None, return_dict: bool = True):
843
+ """
844
+ Args:
845
+ hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
846
+ When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
847
+ hidden_states
848
+ encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, context dim)`, *optional*):
849
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
850
+ self-attention.
851
+ timestep ( `torch.long`, *optional*):
852
+ Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
853
+ return_dict (`bool`, *optional*, defaults to `True`):
854
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
855
+
856
+ Returns:
857
+ [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`]
858
+ if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample
859
+ tensor.
860
+ """
861
+ input_states = hidden_states
862
+
863
+ encoded_states = []
864
+ tokens_start = 0
865
+ for i in range(2):
866
+ # for each of the two transformers, pass the corresponding condition tokens
867
+ condition_state = encoder_hidden_states[:,
868
+ tokens_start: tokens_start + self.condition_lengths[i]]
869
+ transformer_index = self.transformer_index_for_condition[i]
870
+ encoded_state = self.transformers[transformer_index](input_states, condition_state, timestep, return_dict)[
871
+ 0
872
+ ]
873
+ encoded_states.append(encoded_state - input_states)
874
+ tokens_start += self.condition_lengths[i]
875
+
876
+ output_states = encoded_states[0] * self.mix_ratio + \
877
+ encoded_states[1] * (1 - self.mix_ratio)
878
+ output_states = output_states + input_states
879
+
880
+ if not return_dict:
881
+ return (output_states,)
882
+
883
+ return Transformer2DModelOutput(sample=output_states)
884
+
885
+ def _set_attention_slice(self, slice_size):
886
+ for transformer in self.transformers:
887
+ transformer._set_attention_slice(slice_size)
888
+
889
+ def _set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
890
+ for transformer in self.transformers:
891
+ transformer._set_use_memory_efficient_attention_xformers(
892
+ use_memory_efficient_attention_xformers)
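# --- Illustrative sketch (aside): the reshape round-trip that Transformer2DModel.forward
# performs for continuous inputs — (batch, channels, height, width) is flattened to
# (batch, height*width, channels) before the transformer blocks and restored afterwards.
import torch

b, c, h, w = 2, 320, 64, 64
feat = torch.randn(b, c, h, w)
tokens = feat.permute(0, 2, 3, 1).reshape(b, h * w, c)      # to sequence form
restored = tokens.reshape(b, h, w, c).permute(0, 3, 1, 2)   # back to image form
assert torch.equal(feat, restored)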
models/region_diffusion.py ADDED
@@ -0,0 +1,307 @@
1
+ import os
2
+ import torch
3
+ import collections
4
+ import torch.nn as nn
5
+ from functools import partial
6
+ from transformers import CLIPTextModel, CLIPTokenizer, logging
7
+ from diffusers import AutoencoderKL, PNDMScheduler, EulerDiscreteScheduler, DPMSolverMultistepScheduler
8
+ from models.unet_2d_condition import UNet2DConditionModel
9
+
10
+ # suppress partial model loading warning
11
+ logging.set_verbosity_error()
12
+
13
+
14
+ class RegionDiffusion(nn.Module):
15
+ def __init__(self, device):
16
+ super().__init__()
17
+
18
+ try:
19
+ with open('./TOKEN', 'r') as f:
20
+ self.token = f.read().replace('\n', '') # remove the last \n!
21
+ print(f'[INFO] loaded Hugging Face access token from ./TOKEN!')
22
+ except FileNotFoundError as e:
23
+ self.token = True
24
+ print(f'[INFO] trying to load the Hugging Face access token from the default location; make sure you have run `huggingface-cli login`.')
25
+
26
+ self.device = device
27
+ self.num_train_timesteps = 1000
28
+ self.clip_gradient = False
29
+
30
+ print(f'[INFO] loading stable diffusion...')
31
+ local_pretrained_dir = f'pretrained-guidance/v1'
32
+ if not os.path.isdir(local_pretrained_dir):
33
+ save_pretrained = True
34
+ load_paths = 'runwayml/stable-diffusion-v1-5'
35
+ os.makedirs(local_pretrained_dir, exist_ok=True)
36
+ else:
37
+ save_pretrained = False
38
+ load_paths = local_pretrained_dir
39
+
40
+ # 1. Load the autoencoder model which will be used to decode the latents into image space.
41
+ self.vae = AutoencoderKL.from_pretrained(
42
+ load_paths, subfolder="vae", use_auth_token=self.token).to(self.device)
43
+
44
+ # 2. Load the tokenizer and text encoder to tokenize and encode the text.
45
+ self.tokenizer = CLIPTokenizer.from_pretrained(
46
+ load_paths, subfolder='tokenizer', use_auth_token=self.token)
47
+ self.text_encoder = CLIPTextModel.from_pretrained(
48
+ load_paths, subfolder='text_encoder', use_auth_token=self.token).to(self.device)
49
+
50
+ # 3. The UNet model for generating the latents.
51
+ self.unet = UNet2DConditionModel.from_pretrained(
52
+ load_paths, subfolder="unet", use_auth_token=self.token).to(self.device)
53
+
54
+ if save_pretrained:
55
+ self.vae.save_pretrained(os.path.join(local_pretrained_dir, 'vae'))
56
+ self.tokenizer.save_pretrained(
57
+ os.path.join(local_pretrained_dir, 'tokenizer'))
58
+ self.text_encoder.save_pretrained(
59
+ os.path.join(local_pretrained_dir, 'text_encoder'))
60
+ self.unet.save_pretrained(
61
+ os.path.join(local_pretrained_dir, 'unet'))
62
+
63
+ self.scheduler = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
64
+ num_train_timesteps=self.num_train_timesteps, skip_prk_steps=True, steps_offset=1)
65
+ self.alphas_cumprod = self.scheduler.alphas_cumprod.to(self.device)
66
+
67
+ self.masks = []
68
+ self.attention_maps = None
69
+ self.color_loss = torch.nn.functional.mse_loss
70
+
71
+ print(f'[INFO] loaded stable diffusion!')
72
+
73
+ def get_text_embeds(self, prompt, negative_prompt):
74
+ # prompt, negative_prompt: [str]
75
+
76
+ # Tokenize text and get embeddings
77
+ text_input = self.tokenizer(
78
+ prompt, padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')
79
+
80
+ with torch.no_grad():
81
+ text_embeddings = self.text_encoder(
82
+ text_input.input_ids.to(self.device))[0]
83
+
84
+ # Do the same for unconditional embeddings
85
+ uncond_input = self.tokenizer(negative_prompt, padding='max_length',
86
+ max_length=self.tokenizer.model_max_length, return_tensors='pt')
87
+
88
+ with torch.no_grad():
89
+ uncond_embeddings = self.text_encoder(
90
+ uncond_input.input_ids.to(self.device))[0]
91
+
92
+ # Cat for final embeddings
93
+ text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
94
+ return text_embeddings
95
+
96
+ def get_text_embeds_list(self, prompts):
97
+ # prompts: [list]
98
+ text_embeddings = []
99
+ for prompt in prompts:
100
+ # Tokenize text and get embeddings
101
+ text_input = self.tokenizer(
102
+ [prompt], padding='max_length', max_length=self.tokenizer.model_max_length, truncation=True, return_tensors='pt')
103
+
104
+ with torch.no_grad():
105
+ text_embeddings.append(self.text_encoder(
106
+ text_input.input_ids.to(self.device))[0])
107
+
108
+ return text_embeddings
109
+
110
+ def produce_latents(self, text_embeddings, height=512, width=512, num_inference_steps=50, guidance_scale=7.5,
111
+ latents=None, use_grad_guidance=False, text_format_dict={}):
112
+
113
+ if latents is None:
114
+ latents = torch.randn(
115
+ (1, self.unet.in_channels, height // 8, width // 8), device=self.device)
116
+
117
+ self.scheduler.set_timesteps(num_inference_steps)
118
+ n_styles = text_embeddings.shape[0]-1
119
+ assert n_styles == len(self.masks)
120
+
121
+ with torch.autocast('cuda'):
122
+ for i, t in enumerate(self.scheduler.timesteps):
123
+
124
+ # predict the noise residual
125
+ with torch.no_grad():
126
+ noise_pred_uncond = self.unet(latents, t, encoder_hidden_states=text_embeddings[:1],
127
+ text_format_dict={})['sample']
128
+ noise_pred_text = None
129
+ for style_i, mask in enumerate(self.masks):
130
+ if style_i < len(self.masks) - 1:
131
+ masked_latent = latents
132
+ noise_pred_text_cur = self.unet(masked_latent, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
133
+ text_format_dict={})['sample']
134
+ else:
135
+ noise_pred_text_cur = self.unet(latents, t, encoder_hidden_states=text_embeddings[style_i+1:style_i+2],
136
+ text_format_dict=text_format_dict)['sample']
137
+ if noise_pred_text is None:
138
+ noise_pred_text = noise_pred_text_cur * mask
139
+ else:
140
+ noise_pred_text = noise_pred_text + noise_pred_text_cur*mask
141
+
142
+ # perform classifier-free guidance
143
+ noise_pred = noise_pred_uncond + guidance_scale * \
144
+ (noise_pred_text - noise_pred_uncond)
145
+
146
+ # compute the previous noisy sample x_t -> x_t-1
147
+ latents = self.scheduler.step(noise_pred, t, latents)[
148
+ 'prev_sample']
149
+
150
+ # apply gradient guidance
151
+ if use_grad_guidance and t < text_format_dict['guidance_start_step']:
152
+ with torch.enable_grad():
153
+ if not latents.requires_grad:
154
+ latents.requires_grad = True
155
+ latents_0 = self.predict_x0(latents, noise_pred, t)
156
+ latents_inp = 1 / 0.18215 * latents_0
157
+ imgs = self.vae.decode(latents_inp).sample
158
+ imgs = (imgs / 2 + 0.5).clamp(0, 1)
159
+ loss_total = 0.
160
+ for attn_map, rgb_val in zip(text_format_dict['color_obj_atten'], text_format_dict['target_RGB']):
161
+ avg_rgb = (
162
+ imgs*attn_map[:, 0]).sum(2).sum(2)/attn_map[:, 0].sum()
163
+ loss = self.color_loss(
164
+ avg_rgb, rgb_val[:, :, 0, 0])*100
165
+ # print(loss)
166
+ loss_total += loss
167
+ loss_total.backward()
168
+ latents = (
169
+ latents - latents.grad * text_format_dict['color_guidance_weight']).detach().clone()
170
+
171
+ return latents
172
+
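# --- Illustrative sketch (aside): the region-weighted classifier-free guidance used in
# `produce_latents` above. Each region prompt gets its own noise prediction, the
# predictions are blended by soft token masks (assumed to sum to 1 per pixel), and
# standard CFG is then applied against the unconditional prediction.
import torch

guidance_scale = 8.5
latent_shape = (1, 4, 64, 64)
noise_pred_uncond = torch.randn(latent_shape)
region_preds = [torch.randn(latent_shape) for _ in range(2)]   # one per region prompt
masks = torch.softmax(torch.randn(2, 1, 1, 64, 64), dim=0)     # toy masks summing to 1
noise_pred_text = sum(pred * mask for pred, mask in zip(region_preds, masks))
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)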
173
+ def predict_x0(self, x_t, eps_t, t):
174
+ alpha_t = self.scheduler.alphas_cumprod[t]
175
+ return (x_t - eps_t * torch.sqrt(1-alpha_t)) / torch.sqrt(alpha_t)
176
+
177
+ def produce_attn_maps(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
178
+ guidance_scale=7.5, latents=None):
179
+
180
+ if isinstance(prompts, str):
181
+ prompts = [prompts]
182
+
183
+ if isinstance(negative_prompts, str):
184
+ negative_prompts = [negative_prompts]
185
+
186
+ # Prompts -> text embeds
187
+ text_embeddings = self.get_text_embeds(
188
+ prompts, negative_prompts) # [2, 77, 768]
189
+ if latents is None:
190
+ latents = torch.randn(
191
+ (text_embeddings.shape[0] // 2, self.unet.in_channels, height // 8, width // 8), device=self.device)
192
+
193
+ self.scheduler.set_timesteps(num_inference_steps)
194
+
195
+ with torch.autocast('cuda'):
196
+ for i, t in enumerate(self.scheduler.timesteps):
197
+ # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
198
+ latent_model_input = torch.cat([latents] * 2)
199
+
200
+ # predict the noise residual
201
+ with torch.no_grad():
202
+ noise_pred = self.unet(
203
+ latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']
204
+
205
+ # perform guidance
206
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
207
+ noise_pred = noise_pred_uncond + guidance_scale * \
208
+ (noise_pred_text - noise_pred_uncond)
209
+
210
+ # compute the previous noisy sample x_t -> x_t-1
211
+ latents = self.scheduler.step(noise_pred, t, latents)[
212
+ 'prev_sample']
213
+
214
+ # Img latents -> imgs
215
+ imgs = self.decode_latents(latents) # [1, 3, 512, 512]
216
+
217
+ # Img to Numpy
218
+ imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
219
+ imgs = (imgs * 255).round().astype('uint8')
220
+
221
+ return imgs
222
+
223
+ def decode_latents(self, latents):
224
+
225
+ latents = 1 / 0.18215 * latents
226
+
227
+ with torch.no_grad():
228
+ imgs = self.vae.decode(latents).sample
229
+
230
+ imgs = (imgs / 2 + 0.5).clamp(0, 1)
231
+
232
+ return imgs
233
+
234
+ def prompt_to_img(self, prompts, negative_prompts='', height=512, width=512, num_inference_steps=50,
235
+ guidance_scale=7.5, latents=None, text_format_dict={}, use_grad_guidance=False):
236
+
237
+ if isinstance(prompts, str):
238
+ prompts = [prompts]
239
+
240
+ if isinstance(negative_prompts, str):
241
+ negative_prompts = [negative_prompts]
242
+
243
+ # Prompts -> text embeds
244
+ text_embeds = self.get_text_embeds(
245
+ prompts, negative_prompts) # [2, 77, 768]
246
+
247
+ if len(text_format_dict) > 0:
248
+ if 'font_styles' in text_format_dict and text_format_dict['font_styles'] is not None:
249
+ text_format_dict['font_styles_embs'] = self.get_text_embeds_list(
250
+ text_format_dict['font_styles']) # [2, 77, 768]
251
+ else:
252
+ text_format_dict['font_styles_embs'] = None
253
+
254
+ # else:
255
+ latents = self.produce_latents(text_embeds, height=height, width=width, latents=latents,
256
+ num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
257
+ use_grad_guidance=use_grad_guidance, text_format_dict=text_format_dict) # [1, 4, 64, 64]
258
+
259
+ # Img latents -> imgs
260
+ imgs = self.decode_latents(latents) # [1, 3, 512, 512]
261
+
262
+ # Img to Numpy
263
+ imgs = imgs.detach().cpu().permute(0, 2, 3, 1).numpy()
264
+ imgs = (imgs * 255).round().astype('uint8')
265
+
266
+ return imgs
267
+
268
+ def reset_attention_maps(self):
269
+ r"""Function to reset attention maps.
270
+ We reset attention maps because we append them while getting hooks
271
+ to visualize attention maps for every step.
272
+ """
273
+ for key in self.attention_maps:
274
+ self.attention_maps[key] = []
275
+
276
+ def register_evaluation_hooks(self):
277
+ r"""Function for registering hooks during evaluation.
278
+ We mainly store activation maps averaged over queries.
279
+ """
280
+ self.forward_hooks = []
281
+
282
+ def save_activations(activations, name, module, inp, out):
283
+ r"""
284
+ PyTorch Forward hook to save outputs at each forward pass.
285
+ """
286
+ # out[0] - final output of attention layer
287
+ # out[1] - attention probability matrix
288
+ if 'attn2' in name:
289
+ assert out[1].shape[-1] == 77
290
+ activations[name].append(out[1].detach().cpu())
291
+ else:
292
+ assert out[1].shape[-1] != 77
293
+ attention_dict = collections.defaultdict(list)
294
+ for name, module in self.unet.named_modules():
295
+ leaf_name = name.split('.')[-1]
296
+ if 'attn' in leaf_name:
297
+ # Register hook to obtain outputs at every attention layer.
298
+ self.forward_hooks.append(module.register_forward_hook(
299
+ partial(save_activations, attention_dict, name)
300
+ ))
301
+ # attention_dict is a dictionary containing attention maps for every attention layer
302
+ self.attention_maps = attention_dict
303
+
304
+ def remove_evaluation_hooks(self):
305
+ for hook in self.forward_hooks:
306
+ hook.remove()
307
+ self.attention_maps = None
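
For readers skimming the diff, here is a small, self-contained sketch (not part of the commit) of the region-blended classifier-free guidance that produce_latents performs above: each region prompt's noise prediction is weighted by its spatial mask, the weighted predictions are summed, and the usual guidance formula is applied. The two-region setup, tensor shapes, and the helper name blend_region_noise are illustrative assumptions, not code from this repository.

import torch

def blend_region_noise(noise_uncond, region_noise_preds, masks, guidance_scale=8.5):
    # noise_uncond:        [1, 4, 64, 64] prediction from the unconditional embedding
    # region_noise_preds:  list of [1, 4, 64, 64] predictions, one per region prompt
    # masks:               list of [1, 1, 64, 64] soft masks that sum to 1 at every pixel
    noise_pred_text = sum(pred * mask for pred, mask in zip(region_noise_preds, masks))
    # standard classifier-free guidance applied to the blended conditional prediction
    return noise_uncond + guidance_scale * (noise_pred_text - noise_uncond)

# toy check with two complementary regions
uncond = torch.randn(1, 4, 64, 64)
region_preds = [torch.randn(1, 4, 64, 64) for _ in range(2)]
m = torch.rand(1, 1, 64, 64)
out = blend_region_noise(uncond, region_preds, [m, 1 - m])
print(out.shape)  # torch.Size([1, 4, 64, 64])

With a single region whose mask is all ones, this reduces to the standard classifier-free guidance update used in produce_attn_maps.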
models/unet_2d_blocks.py ADDED
@@ -0,0 +1,1670 @@
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import numpy as np
+ import torch
+ from torch import nn
+
+ from .attention import AttentionBlock, DualTransformer2DModel, Transformer2DModel
+ from diffusers.models.resnet import Downsample2D, FirDownsample2D, FirUpsample2D, ResnetBlock2D, Upsample2D
+
+
22
+ def get_down_block(
23
+ down_block_type,
24
+ num_layers,
25
+ in_channels,
26
+ out_channels,
27
+ temb_channels,
28
+ add_downsample,
29
+ resnet_eps,
30
+ resnet_act_fn,
31
+ attn_num_head_channels,
32
+ resnet_groups=None,
33
+ cross_attention_dim=None,
34
+ downsample_padding=None,
35
+ dual_cross_attention=False,
36
+ use_linear_projection=False,
37
+ only_cross_attention=False,
38
+ ):
39
+ down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
40
+ if down_block_type == "DownBlock2D":
41
+ return DownBlock2D(
42
+ num_layers=num_layers,
43
+ in_channels=in_channels,
44
+ out_channels=out_channels,
45
+ temb_channels=temb_channels,
46
+ add_downsample=add_downsample,
47
+ resnet_eps=resnet_eps,
48
+ resnet_act_fn=resnet_act_fn,
49
+ resnet_groups=resnet_groups,
50
+ downsample_padding=downsample_padding,
51
+ )
52
+ elif down_block_type == "AttnDownBlock2D":
53
+ return AttnDownBlock2D(
54
+ num_layers=num_layers,
55
+ in_channels=in_channels,
56
+ out_channels=out_channels,
57
+ temb_channels=temb_channels,
58
+ add_downsample=add_downsample,
59
+ resnet_eps=resnet_eps,
60
+ resnet_act_fn=resnet_act_fn,
61
+ resnet_groups=resnet_groups,
62
+ downsample_padding=downsample_padding,
63
+ attn_num_head_channels=attn_num_head_channels,
64
+ )
65
+ elif down_block_type == "CrossAttnDownBlock2D":
66
+ if cross_attention_dim is None:
67
+ raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D")
68
+ return CrossAttnDownBlock2D(
69
+ num_layers=num_layers,
70
+ in_channels=in_channels,
71
+ out_channels=out_channels,
72
+ temb_channels=temb_channels,
73
+ add_downsample=add_downsample,
74
+ resnet_eps=resnet_eps,
75
+ resnet_act_fn=resnet_act_fn,
76
+ resnet_groups=resnet_groups,
77
+ downsample_padding=downsample_padding,
78
+ cross_attention_dim=cross_attention_dim,
79
+ attn_num_head_channels=attn_num_head_channels,
80
+ dual_cross_attention=dual_cross_attention,
81
+ use_linear_projection=use_linear_projection,
82
+ only_cross_attention=only_cross_attention,
83
+ )
84
+ elif down_block_type == "SkipDownBlock2D":
85
+ return SkipDownBlock2D(
86
+ num_layers=num_layers,
87
+ in_channels=in_channels,
88
+ out_channels=out_channels,
89
+ temb_channels=temb_channels,
90
+ add_downsample=add_downsample,
91
+ resnet_eps=resnet_eps,
92
+ resnet_act_fn=resnet_act_fn,
93
+ downsample_padding=downsample_padding,
94
+ )
95
+ elif down_block_type == "AttnSkipDownBlock2D":
96
+ return AttnSkipDownBlock2D(
97
+ num_layers=num_layers,
98
+ in_channels=in_channels,
99
+ out_channels=out_channels,
100
+ temb_channels=temb_channels,
101
+ add_downsample=add_downsample,
102
+ resnet_eps=resnet_eps,
103
+ resnet_act_fn=resnet_act_fn,
104
+ downsample_padding=downsample_padding,
105
+ attn_num_head_channels=attn_num_head_channels,
106
+ )
107
+ elif down_block_type == "DownEncoderBlock2D":
108
+ return DownEncoderBlock2D(
109
+ num_layers=num_layers,
110
+ in_channels=in_channels,
111
+ out_channels=out_channels,
112
+ add_downsample=add_downsample,
113
+ resnet_eps=resnet_eps,
114
+ resnet_act_fn=resnet_act_fn,
115
+ resnet_groups=resnet_groups,
116
+ downsample_padding=downsample_padding,
117
+ )
118
+ elif down_block_type == "AttnDownEncoderBlock2D":
119
+ return AttnDownEncoderBlock2D(
120
+ num_layers=num_layers,
121
+ in_channels=in_channels,
122
+ out_channels=out_channels,
123
+ add_downsample=add_downsample,
124
+ resnet_eps=resnet_eps,
125
+ resnet_act_fn=resnet_act_fn,
126
+ resnet_groups=resnet_groups,
127
+ downsample_padding=downsample_padding,
128
+ attn_num_head_channels=attn_num_head_channels,
129
+ )
130
+ raise ValueError(f"{down_block_type} does not exist.")
131
+
132
+
133
+ def get_up_block(
134
+ up_block_type,
135
+ num_layers,
136
+ in_channels,
137
+ out_channels,
138
+ prev_output_channel,
139
+ temb_channels,
140
+ add_upsample,
141
+ resnet_eps,
142
+ resnet_act_fn,
143
+ attn_num_head_channels,
144
+ resnet_groups=None,
145
+ cross_attention_dim=None,
146
+ dual_cross_attention=False,
147
+ use_linear_projection=False,
148
+ only_cross_attention=False,
149
+ ):
150
+ up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
151
+ if up_block_type == "UpBlock2D":
152
+ return UpBlock2D(
153
+ num_layers=num_layers,
154
+ in_channels=in_channels,
155
+ out_channels=out_channels,
156
+ prev_output_channel=prev_output_channel,
157
+ temb_channels=temb_channels,
158
+ add_upsample=add_upsample,
159
+ resnet_eps=resnet_eps,
160
+ resnet_act_fn=resnet_act_fn,
161
+ resnet_groups=resnet_groups,
162
+ )
163
+ elif up_block_type == "CrossAttnUpBlock2D":
164
+ if cross_attention_dim is None:
165
+ raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D")
166
+ return CrossAttnUpBlock2D(
167
+ num_layers=num_layers,
168
+ in_channels=in_channels,
169
+ out_channels=out_channels,
170
+ prev_output_channel=prev_output_channel,
171
+ temb_channels=temb_channels,
172
+ add_upsample=add_upsample,
173
+ resnet_eps=resnet_eps,
174
+ resnet_act_fn=resnet_act_fn,
175
+ resnet_groups=resnet_groups,
176
+ cross_attention_dim=cross_attention_dim,
177
+ attn_num_head_channels=attn_num_head_channels,
178
+ dual_cross_attention=dual_cross_attention,
179
+ use_linear_projection=use_linear_projection,
180
+ only_cross_attention=only_cross_attention,
181
+ )
182
+ elif up_block_type == "AttnUpBlock2D":
183
+ return AttnUpBlock2D(
184
+ num_layers=num_layers,
185
+ in_channels=in_channels,
186
+ out_channels=out_channels,
187
+ prev_output_channel=prev_output_channel,
188
+ temb_channels=temb_channels,
189
+ add_upsample=add_upsample,
190
+ resnet_eps=resnet_eps,
191
+ resnet_act_fn=resnet_act_fn,
192
+ resnet_groups=resnet_groups,
193
+ attn_num_head_channels=attn_num_head_channels,
194
+ )
195
+ elif up_block_type == "SkipUpBlock2D":
196
+ return SkipUpBlock2D(
197
+ num_layers=num_layers,
198
+ in_channels=in_channels,
199
+ out_channels=out_channels,
200
+ prev_output_channel=prev_output_channel,
201
+ temb_channels=temb_channels,
202
+ add_upsample=add_upsample,
203
+ resnet_eps=resnet_eps,
204
+ resnet_act_fn=resnet_act_fn,
205
+ )
206
+ elif up_block_type == "AttnSkipUpBlock2D":
207
+ return AttnSkipUpBlock2D(
208
+ num_layers=num_layers,
209
+ in_channels=in_channels,
210
+ out_channels=out_channels,
211
+ prev_output_channel=prev_output_channel,
212
+ temb_channels=temb_channels,
213
+ add_upsample=add_upsample,
214
+ resnet_eps=resnet_eps,
215
+ resnet_act_fn=resnet_act_fn,
216
+ attn_num_head_channels=attn_num_head_channels,
217
+ )
218
+ elif up_block_type == "UpDecoderBlock2D":
219
+ return UpDecoderBlock2D(
220
+ num_layers=num_layers,
221
+ in_channels=in_channels,
222
+ out_channels=out_channels,
223
+ add_upsample=add_upsample,
224
+ resnet_eps=resnet_eps,
225
+ resnet_act_fn=resnet_act_fn,
226
+ resnet_groups=resnet_groups,
227
+ )
228
+ elif up_block_type == "AttnUpDecoderBlock2D":
229
+ return AttnUpDecoderBlock2D(
230
+ num_layers=num_layers,
231
+ in_channels=in_channels,
232
+ out_channels=out_channels,
233
+ add_upsample=add_upsample,
234
+ resnet_eps=resnet_eps,
235
+ resnet_act_fn=resnet_act_fn,
236
+ resnet_groups=resnet_groups,
237
+ attn_num_head_channels=attn_num_head_channels,
238
+ )
239
+ raise ValueError(f"{up_block_type} does not exist.")
240
+
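
As an orientation aid (not part of the commit), get_down_block and get_up_block above are the usual diffusers-style dispatchers that the UNet constructor is expected to call once per resolution level. A hedged sketch of one such call; the block type and channel sizes below are made up for illustration and are not taken from the actual model config:

# assumes: from models.unet_2d_blocks import get_down_block
down_block = get_down_block(
    "CrossAttnDownBlock2D",      # dispatch string -> CrossAttnDownBlock2D defined above
    num_layers=2,
    in_channels=320,
    out_channels=640,
    temb_channels=1280,
    add_downsample=True,
    resnet_eps=1e-5,
    resnet_act_fn="silu",
    attn_num_head_channels=8,
    resnet_groups=32,
    cross_attention_dim=768,
    downsample_padding=1,
)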
241
+
242
+ class UNetMidBlock2D(nn.Module):
243
+ def __init__(
244
+ self,
245
+ in_channels: int,
246
+ temb_channels: int,
247
+ dropout: float = 0.0,
248
+ num_layers: int = 1,
249
+ resnet_eps: float = 1e-6,
250
+ resnet_time_scale_shift: str = "default",
251
+ resnet_act_fn: str = "swish",
252
+ resnet_groups: int = 32,
253
+ resnet_pre_norm: bool = True,
254
+ attn_num_head_channels=1,
255
+ attention_type="default",
256
+ output_scale_factor=1.0,
257
+ ):
258
+ super().__init__()
259
+
260
+ self.attention_type = attention_type
261
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
262
+
263
+ # there is always at least one resnet
264
+ resnets = [
265
+ ResnetBlock2D(
266
+ in_channels=in_channels,
267
+ out_channels=in_channels,
268
+ temb_channels=temb_channels,
269
+ eps=resnet_eps,
270
+ groups=resnet_groups,
271
+ dropout=dropout,
272
+ time_embedding_norm=resnet_time_scale_shift,
273
+ non_linearity=resnet_act_fn,
274
+ output_scale_factor=output_scale_factor,
275
+ pre_norm=resnet_pre_norm,
276
+ )
277
+ ]
278
+ attentions = []
279
+
280
+ for _ in range(num_layers):
281
+ attentions.append(
282
+ AttentionBlock(
283
+ in_channels,
284
+ num_head_channels=attn_num_head_channels,
285
+ rescale_output_factor=output_scale_factor,
286
+ eps=resnet_eps,
287
+ norm_num_groups=resnet_groups,
288
+ )
289
+ )
290
+ resnets.append(
291
+ ResnetBlock2D(
292
+ in_channels=in_channels,
293
+ out_channels=in_channels,
294
+ temb_channels=temb_channels,
295
+ eps=resnet_eps,
296
+ groups=resnet_groups,
297
+ dropout=dropout,
298
+ time_embedding_norm=resnet_time_scale_shift,
299
+ non_linearity=resnet_act_fn,
300
+ output_scale_factor=output_scale_factor,
301
+ pre_norm=resnet_pre_norm,
302
+ )
303
+ )
304
+
305
+ self.attentions = nn.ModuleList(attentions)
306
+ self.resnets = nn.ModuleList(resnets)
307
+
308
+ def forward(self, hidden_states, temb=None, encoder_states=None):
309
+ hidden_states = self.resnets[0](hidden_states, temb)
310
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
311
+ if self.attention_type == "default":
312
+ hidden_states = attn(hidden_states)
313
+ else:
314
+ hidden_states = attn(hidden_states, encoder_states)
315
+ hidden_states = resnet(hidden_states, temb)
316
+
317
+ return hidden_states
318
+
319
+
320
+ class UNetMidBlock2DCrossAttn(nn.Module):
321
+ def __init__(
322
+ self,
323
+ in_channels: int,
324
+ temb_channels: int,
325
+ dropout: float = 0.0,
326
+ num_layers: int = 1,
327
+ resnet_eps: float = 1e-6,
328
+ resnet_time_scale_shift: str = "default",
329
+ resnet_act_fn: str = "swish",
330
+ resnet_groups: int = 32,
331
+ resnet_pre_norm: bool = True,
332
+ attn_num_head_channels=1,
333
+ attention_type="default",
334
+ output_scale_factor=1.0,
335
+ cross_attention_dim=1280,
336
+ dual_cross_attention=False,
337
+ use_linear_projection=False,
338
+ ):
339
+ super().__init__()
340
+
341
+ self.attention_type = attention_type
342
+ self.attn_num_head_channels = attn_num_head_channels
343
+ resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
344
+
345
+ # there is always at least one resnet
346
+ resnets = [
347
+ ResnetBlock2D(
348
+ in_channels=in_channels,
349
+ out_channels=in_channels,
350
+ temb_channels=temb_channels,
351
+ eps=resnet_eps,
352
+ groups=resnet_groups,
353
+ dropout=dropout,
354
+ time_embedding_norm=resnet_time_scale_shift,
355
+ non_linearity=resnet_act_fn,
356
+ output_scale_factor=output_scale_factor,
357
+ pre_norm=resnet_pre_norm,
358
+ )
359
+ ]
360
+ attentions = []
361
+
362
+ for _ in range(num_layers):
363
+ if not dual_cross_attention:
364
+ attentions.append(
365
+ Transformer2DModel(
366
+ attn_num_head_channels,
367
+ in_channels // attn_num_head_channels,
368
+ in_channels=in_channels,
369
+ num_layers=1,
370
+ cross_attention_dim=cross_attention_dim,
371
+ norm_num_groups=resnet_groups,
372
+ use_linear_projection=use_linear_projection,
373
+ )
374
+ )
375
+ else:
376
+ attentions.append(
377
+ DualTransformer2DModel(
378
+ attn_num_head_channels,
379
+ in_channels // attn_num_head_channels,
380
+ in_channels=in_channels,
381
+ num_layers=1,
382
+ cross_attention_dim=cross_attention_dim,
383
+ norm_num_groups=resnet_groups,
384
+ )
385
+ )
386
+ resnets.append(
387
+ ResnetBlock2D(
388
+ in_channels=in_channels,
389
+ out_channels=in_channels,
390
+ temb_channels=temb_channels,
391
+ eps=resnet_eps,
392
+ groups=resnet_groups,
393
+ dropout=dropout,
394
+ time_embedding_norm=resnet_time_scale_shift,
395
+ non_linearity=resnet_act_fn,
396
+ output_scale_factor=output_scale_factor,
397
+ pre_norm=resnet_pre_norm,
398
+ )
399
+ )
400
+
401
+ self.attentions = nn.ModuleList(attentions)
402
+ self.resnets = nn.ModuleList(resnets)
403
+
404
+ def set_attention_slice(self, slice_size):
405
+ head_dims = self.attn_num_head_channels
406
+ head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
407
+ if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
408
+ raise ValueError(
409
+ f"Make sure slice_size {slice_size} is a common divisor of "
410
+ f"the number of heads used in cross_attention: {head_dims}"
411
+ )
412
+ if slice_size is not None and slice_size > min(head_dims):
413
+ raise ValueError(
414
+ f"slice_size {slice_size} has to be smaller or equal to "
415
+ f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
416
+ )
417
+
418
+ for attn in self.attentions:
419
+ attn._set_attention_slice(slice_size)
420
+
421
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
422
+ for attn in self.attentions:
423
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
424
+
425
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
426
+ text_format_dict={}):
427
+ hidden_states = self.resnets[0](hidden_states, temb)
428
+ for attn, resnet in zip(self.attentions, self.resnets[1:]):
429
+ hidden_states = attn(hidden_states, encoder_hidden_states,
430
+ text_format_dict).sample
431
+ hidden_states = resnet(hidden_states, temb)
432
+
433
+ return hidden_states
434
+
435
+
436
+ class AttnDownBlock2D(nn.Module):
437
+ def __init__(
438
+ self,
439
+ in_channels: int,
440
+ out_channels: int,
441
+ temb_channels: int,
442
+ dropout: float = 0.0,
443
+ num_layers: int = 1,
444
+ resnet_eps: float = 1e-6,
445
+ resnet_time_scale_shift: str = "default",
446
+ resnet_act_fn: str = "swish",
447
+ resnet_groups: int = 32,
448
+ resnet_pre_norm: bool = True,
449
+ attn_num_head_channels=1,
450
+ attention_type="default",
451
+ output_scale_factor=1.0,
452
+ downsample_padding=1,
453
+ add_downsample=True,
454
+ ):
455
+ super().__init__()
456
+ resnets = []
457
+ attentions = []
458
+
459
+ self.attention_type = attention_type
460
+
461
+ for i in range(num_layers):
462
+ in_channels = in_channels if i == 0 else out_channels
463
+ resnets.append(
464
+ ResnetBlock2D(
465
+ in_channels=in_channels,
466
+ out_channels=out_channels,
467
+ temb_channels=temb_channels,
468
+ eps=resnet_eps,
469
+ groups=resnet_groups,
470
+ dropout=dropout,
471
+ time_embedding_norm=resnet_time_scale_shift,
472
+ non_linearity=resnet_act_fn,
473
+ output_scale_factor=output_scale_factor,
474
+ pre_norm=resnet_pre_norm,
475
+ )
476
+ )
477
+ attentions.append(
478
+ AttentionBlock(
479
+ out_channels,
480
+ num_head_channels=attn_num_head_channels,
481
+ rescale_output_factor=output_scale_factor,
482
+ eps=resnet_eps,
483
+ norm_num_groups=resnet_groups,
484
+ )
485
+ )
486
+
487
+ self.attentions = nn.ModuleList(attentions)
488
+ self.resnets = nn.ModuleList(resnets)
489
+
490
+ if add_downsample:
491
+ self.downsamplers = nn.ModuleList(
492
+ [
493
+ Downsample2D(
494
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
495
+ )
496
+ ]
497
+ )
498
+ else:
499
+ self.downsamplers = None
500
+
501
+ def forward(self, hidden_states, temb=None):
502
+ output_states = ()
503
+
504
+ for resnet, attn in zip(self.resnets, self.attentions):
505
+ hidden_states = resnet(hidden_states, temb)
506
+ hidden_states = attn(hidden_states)
507
+ output_states += (hidden_states,)
508
+
509
+ if self.downsamplers is not None:
510
+ for downsampler in self.downsamplers:
511
+ hidden_states = downsampler(hidden_states)
512
+
513
+ output_states += (hidden_states,)
514
+
515
+ return hidden_states, output_states
516
+
517
+
518
+ class CrossAttnDownBlock2D(nn.Module):
519
+ def __init__(
520
+ self,
521
+ in_channels: int,
522
+ out_channels: int,
523
+ temb_channels: int,
524
+ dropout: float = 0.0,
525
+ num_layers: int = 1,
526
+ resnet_eps: float = 1e-6,
527
+ resnet_time_scale_shift: str = "default",
528
+ resnet_act_fn: str = "swish",
529
+ resnet_groups: int = 32,
530
+ resnet_pre_norm: bool = True,
531
+ attn_num_head_channels=1,
532
+ cross_attention_dim=1280,
533
+ attention_type="default",
534
+ output_scale_factor=1.0,
535
+ downsample_padding=1,
536
+ add_downsample=True,
537
+ dual_cross_attention=False,
538
+ use_linear_projection=False,
539
+ only_cross_attention=False,
540
+ ):
541
+ super().__init__()
542
+ resnets = []
543
+ attentions = []
544
+
545
+ self.attention_type = attention_type
546
+ self.attn_num_head_channels = attn_num_head_channels
547
+
548
+ for i in range(num_layers):
549
+ in_channels = in_channels if i == 0 else out_channels
550
+ resnets.append(
551
+ ResnetBlock2D(
552
+ in_channels=in_channels,
553
+ out_channels=out_channels,
554
+ temb_channels=temb_channels,
555
+ eps=resnet_eps,
556
+ groups=resnet_groups,
557
+ dropout=dropout,
558
+ time_embedding_norm=resnet_time_scale_shift,
559
+ non_linearity=resnet_act_fn,
560
+ output_scale_factor=output_scale_factor,
561
+ pre_norm=resnet_pre_norm,
562
+ )
563
+ )
564
+ if not dual_cross_attention:
565
+ attentions.append(
566
+ Transformer2DModel(
567
+ attn_num_head_channels,
568
+ out_channels // attn_num_head_channels,
569
+ in_channels=out_channels,
570
+ num_layers=1,
571
+ cross_attention_dim=cross_attention_dim,
572
+ norm_num_groups=resnet_groups,
573
+ use_linear_projection=use_linear_projection,
574
+ only_cross_attention=only_cross_attention,
575
+ )
576
+ )
577
+ else:
578
+ attentions.append(
579
+ DualTransformer2DModel(
580
+ attn_num_head_channels,
581
+ out_channels // attn_num_head_channels,
582
+ in_channels=out_channels,
583
+ num_layers=1,
584
+ cross_attention_dim=cross_attention_dim,
585
+ norm_num_groups=resnet_groups,
586
+ )
587
+ )
588
+ self.attentions = nn.ModuleList(attentions)
589
+ self.resnets = nn.ModuleList(resnets)
590
+
591
+ if add_downsample:
592
+ self.downsamplers = nn.ModuleList(
593
+ [
594
+ Downsample2D(
595
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
596
+ )
597
+ ]
598
+ )
599
+ else:
600
+ self.downsamplers = None
601
+
602
+ self.gradient_checkpointing = False
603
+
604
+ def set_attention_slice(self, slice_size):
605
+ head_dims = self.attn_num_head_channels
606
+ head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
607
+ if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
608
+ raise ValueError(
609
+ f"Make sure slice_size {slice_size} is a common divisor of "
610
+ f"the number of heads used in cross_attention: {head_dims}"
611
+ )
612
+ if slice_size is not None and slice_size > min(head_dims):
613
+ raise ValueError(
614
+ f"slice_size {slice_size} has to be smaller or equal to "
615
+ f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
616
+ )
617
+
618
+ for attn in self.attentions:
619
+ attn._set_attention_slice(slice_size)
620
+
621
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
622
+ for attn in self.attentions:
623
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
624
+
625
+ def forward(self, hidden_states, temb=None, encoder_hidden_states=None,
626
+ text_format_dict={}):
627
+ output_states = ()
628
+
629
+ for resnet, attn in zip(self.resnets, self.attentions):
630
+ if self.training and self.gradient_checkpointing:
631
+
632
+ def create_custom_forward(module, return_dict=None):
633
+ def custom_forward(*inputs):
634
+ if return_dict is not None:
635
+ return module(*inputs, return_dict=return_dict)
636
+ else:
637
+ return module(*inputs)
638
+
639
+ return custom_forward
640
+
641
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
642
+ hidden_states = torch.utils.checkpoint.checkpoint(
643
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states,
644
+ text_format_dict
645
+ )[0]
646
+ else:
647
+ hidden_states = resnet(hidden_states, temb)
648
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
649
+ text_format_dict=text_format_dict).sample
650
+
651
+ output_states += (hidden_states,)
652
+
653
+ if self.downsamplers is not None:
654
+ for downsampler in self.downsamplers:
655
+ hidden_states = downsampler(hidden_states)
656
+
657
+ output_states += (hidden_states,)
658
+
659
+ return hidden_states, output_states
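
The create_custom_forward closure in the forward above lets the block route optional arguments such as return_dict through torch.utils.checkpoint.checkpoint, which re-runs the wrapped call during the backward pass instead of storing activations. A minimal, runnable sketch of the same pattern, using a toy linear layer rather than the actual attention block:

import torch
import torch.utils.checkpoint
from torch import nn

def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs)
    return custom_forward

layer = nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)
# activations of `layer` are recomputed in the backward pass instead of being stored
y = torch.utils.checkpoint.checkpoint(create_custom_forward(layer), x)
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 16])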
660
+
661
+
662
+ class DownBlock2D(nn.Module):
663
+ def __init__(
664
+ self,
665
+ in_channels: int,
666
+ out_channels: int,
667
+ temb_channels: int,
668
+ dropout: float = 0.0,
669
+ num_layers: int = 1,
670
+ resnet_eps: float = 1e-6,
671
+ resnet_time_scale_shift: str = "default",
672
+ resnet_act_fn: str = "swish",
673
+ resnet_groups: int = 32,
674
+ resnet_pre_norm: bool = True,
675
+ output_scale_factor=1.0,
676
+ add_downsample=True,
677
+ downsample_padding=1,
678
+ ):
679
+ super().__init__()
680
+ resnets = []
681
+
682
+ for i in range(num_layers):
683
+ in_channels = in_channels if i == 0 else out_channels
684
+ resnets.append(
685
+ ResnetBlock2D(
686
+ in_channels=in_channels,
687
+ out_channels=out_channels,
688
+ temb_channels=temb_channels,
689
+ eps=resnet_eps,
690
+ groups=resnet_groups,
691
+ dropout=dropout,
692
+ time_embedding_norm=resnet_time_scale_shift,
693
+ non_linearity=resnet_act_fn,
694
+ output_scale_factor=output_scale_factor,
695
+ pre_norm=resnet_pre_norm,
696
+ )
697
+ )
698
+
699
+ self.resnets = nn.ModuleList(resnets)
700
+
701
+ if add_downsample:
702
+ self.downsamplers = nn.ModuleList(
703
+ [
704
+ Downsample2D(
705
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
706
+ )
707
+ ]
708
+ )
709
+ else:
710
+ self.downsamplers = None
711
+
712
+ self.gradient_checkpointing = False
713
+
714
+ def forward(self, hidden_states, temb=None):
715
+ output_states = ()
716
+
717
+ for resnet in self.resnets:
718
+ if self.training and self.gradient_checkpointing:
719
+
720
+ def create_custom_forward(module):
721
+ def custom_forward(*inputs):
722
+ return module(*inputs)
723
+
724
+ return custom_forward
725
+
726
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
727
+ else:
728
+ hidden_states = resnet(hidden_states, temb)
729
+
730
+ output_states += (hidden_states,)
731
+
732
+ if self.downsamplers is not None:
733
+ for downsampler in self.downsamplers:
734
+ hidden_states = downsampler(hidden_states)
735
+
736
+ output_states += (hidden_states,)
737
+
738
+ return hidden_states, output_states
739
+
740
+
741
+ class DownEncoderBlock2D(nn.Module):
742
+ def __init__(
743
+ self,
744
+ in_channels: int,
745
+ out_channels: int,
746
+ dropout: float = 0.0,
747
+ num_layers: int = 1,
748
+ resnet_eps: float = 1e-6,
749
+ resnet_time_scale_shift: str = "default",
750
+ resnet_act_fn: str = "swish",
751
+ resnet_groups: int = 32,
752
+ resnet_pre_norm: bool = True,
753
+ output_scale_factor=1.0,
754
+ add_downsample=True,
755
+ downsample_padding=1,
756
+ ):
757
+ super().__init__()
758
+ resnets = []
759
+
760
+ for i in range(num_layers):
761
+ in_channels = in_channels if i == 0 else out_channels
762
+ resnets.append(
763
+ ResnetBlock2D(
764
+ in_channels=in_channels,
765
+ out_channels=out_channels,
766
+ temb_channels=None,
767
+ eps=resnet_eps,
768
+ groups=resnet_groups,
769
+ dropout=dropout,
770
+ time_embedding_norm=resnet_time_scale_shift,
771
+ non_linearity=resnet_act_fn,
772
+ output_scale_factor=output_scale_factor,
773
+ pre_norm=resnet_pre_norm,
774
+ )
775
+ )
776
+
777
+ self.resnets = nn.ModuleList(resnets)
778
+
779
+ if add_downsample:
780
+ self.downsamplers = nn.ModuleList(
781
+ [
782
+ Downsample2D(
783
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
784
+ )
785
+ ]
786
+ )
787
+ else:
788
+ self.downsamplers = None
789
+
790
+ def forward(self, hidden_states):
791
+ for resnet in self.resnets:
792
+ hidden_states = resnet(hidden_states, temb=None)
793
+
794
+ if self.downsamplers is not None:
795
+ for downsampler in self.downsamplers:
796
+ hidden_states = downsampler(hidden_states)
797
+
798
+ return hidden_states
799
+
800
+
801
+ class AttnDownEncoderBlock2D(nn.Module):
802
+ def __init__(
803
+ self,
804
+ in_channels: int,
805
+ out_channels: int,
806
+ dropout: float = 0.0,
807
+ num_layers: int = 1,
808
+ resnet_eps: float = 1e-6,
809
+ resnet_time_scale_shift: str = "default",
810
+ resnet_act_fn: str = "swish",
811
+ resnet_groups: int = 32,
812
+ resnet_pre_norm: bool = True,
813
+ attn_num_head_channels=1,
814
+ output_scale_factor=1.0,
815
+ add_downsample=True,
816
+ downsample_padding=1,
817
+ ):
818
+ super().__init__()
819
+ resnets = []
820
+ attentions = []
821
+
822
+ for i in range(num_layers):
823
+ in_channels = in_channels if i == 0 else out_channels
824
+ resnets.append(
825
+ ResnetBlock2D(
826
+ in_channels=in_channels,
827
+ out_channels=out_channels,
828
+ temb_channels=None,
829
+ eps=resnet_eps,
830
+ groups=resnet_groups,
831
+ dropout=dropout,
832
+ time_embedding_norm=resnet_time_scale_shift,
833
+ non_linearity=resnet_act_fn,
834
+ output_scale_factor=output_scale_factor,
835
+ pre_norm=resnet_pre_norm,
836
+ )
837
+ )
838
+ attentions.append(
839
+ AttentionBlock(
840
+ out_channels,
841
+ num_head_channels=attn_num_head_channels,
842
+ rescale_output_factor=output_scale_factor,
843
+ eps=resnet_eps,
844
+ norm_num_groups=resnet_groups,
845
+ )
846
+ )
847
+
848
+ self.attentions = nn.ModuleList(attentions)
849
+ self.resnets = nn.ModuleList(resnets)
850
+
851
+ if add_downsample:
852
+ self.downsamplers = nn.ModuleList(
853
+ [
854
+ Downsample2D(
855
+ out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
856
+ )
857
+ ]
858
+ )
859
+ else:
860
+ self.downsamplers = None
861
+
862
+ def forward(self, hidden_states):
863
+ for resnet, attn in zip(self.resnets, self.attentions):
864
+ hidden_states = resnet(hidden_states, temb=None)
865
+ hidden_states = attn(hidden_states)
866
+
867
+ if self.downsamplers is not None:
868
+ for downsampler in self.downsamplers:
869
+ hidden_states = downsampler(hidden_states)
870
+
871
+ return hidden_states
872
+
873
+
874
+ class AttnSkipDownBlock2D(nn.Module):
875
+ def __init__(
876
+ self,
877
+ in_channels: int,
878
+ out_channels: int,
879
+ temb_channels: int,
880
+ dropout: float = 0.0,
881
+ num_layers: int = 1,
882
+ resnet_eps: float = 1e-6,
883
+ resnet_time_scale_shift: str = "default",
884
+ resnet_act_fn: str = "swish",
885
+ resnet_pre_norm: bool = True,
886
+ attn_num_head_channels=1,
887
+ attention_type="default",
888
+ output_scale_factor=np.sqrt(2.0),
889
+ downsample_padding=1,
890
+ add_downsample=True,
891
+ ):
892
+ super().__init__()
893
+ self.attentions = nn.ModuleList([])
894
+ self.resnets = nn.ModuleList([])
895
+
896
+ self.attention_type = attention_type
897
+
898
+ for i in range(num_layers):
899
+ in_channels = in_channels if i == 0 else out_channels
900
+ self.resnets.append(
901
+ ResnetBlock2D(
902
+ in_channels=in_channels,
903
+ out_channels=out_channels,
904
+ temb_channels=temb_channels,
905
+ eps=resnet_eps,
906
+ groups=min(in_channels // 4, 32),
907
+ groups_out=min(out_channels // 4, 32),
908
+ dropout=dropout,
909
+ time_embedding_norm=resnet_time_scale_shift,
910
+ non_linearity=resnet_act_fn,
911
+ output_scale_factor=output_scale_factor,
912
+ pre_norm=resnet_pre_norm,
913
+ )
914
+ )
915
+ self.attentions.append(
916
+ AttentionBlock(
917
+ out_channels,
918
+ num_head_channels=attn_num_head_channels,
919
+ rescale_output_factor=output_scale_factor,
920
+ eps=resnet_eps,
921
+ )
922
+ )
923
+
924
+ if add_downsample:
925
+ self.resnet_down = ResnetBlock2D(
926
+ in_channels=out_channels,
927
+ out_channels=out_channels,
928
+ temb_channels=temb_channels,
929
+ eps=resnet_eps,
930
+ groups=min(out_channels // 4, 32),
931
+ dropout=dropout,
932
+ time_embedding_norm=resnet_time_scale_shift,
933
+ non_linearity=resnet_act_fn,
934
+ output_scale_factor=output_scale_factor,
935
+ pre_norm=resnet_pre_norm,
936
+ use_in_shortcut=True,
937
+ down=True,
938
+ kernel="fir",
939
+ )
940
+ self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
941
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
942
+ else:
943
+ self.resnet_down = None
944
+ self.downsamplers = None
945
+ self.skip_conv = None
946
+
947
+ def forward(self, hidden_states, temb=None, skip_sample=None):
948
+ output_states = ()
949
+
950
+ for resnet, attn in zip(self.resnets, self.attentions):
951
+ hidden_states = resnet(hidden_states, temb)
952
+ hidden_states = attn(hidden_states)
953
+ output_states += (hidden_states,)
954
+
955
+ if self.downsamplers is not None:
956
+ hidden_states = self.resnet_down(hidden_states, temb)
957
+ for downsampler in self.downsamplers:
958
+ skip_sample = downsampler(skip_sample)
959
+
960
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
961
+
962
+ output_states += (hidden_states,)
963
+
964
+ return hidden_states, output_states, skip_sample
965
+
966
+
967
+ class SkipDownBlock2D(nn.Module):
968
+ def __init__(
969
+ self,
970
+ in_channels: int,
971
+ out_channels: int,
972
+ temb_channels: int,
973
+ dropout: float = 0.0,
974
+ num_layers: int = 1,
975
+ resnet_eps: float = 1e-6,
976
+ resnet_time_scale_shift: str = "default",
977
+ resnet_act_fn: str = "swish",
978
+ resnet_pre_norm: bool = True,
979
+ output_scale_factor=np.sqrt(2.0),
980
+ add_downsample=True,
981
+ downsample_padding=1,
982
+ ):
983
+ super().__init__()
984
+ self.resnets = nn.ModuleList([])
985
+
986
+ for i in range(num_layers):
987
+ in_channels = in_channels if i == 0 else out_channels
988
+ self.resnets.append(
989
+ ResnetBlock2D(
990
+ in_channels=in_channels,
991
+ out_channels=out_channels,
992
+ temb_channels=temb_channels,
993
+ eps=resnet_eps,
994
+ groups=min(in_channels // 4, 32),
995
+ groups_out=min(out_channels // 4, 32),
996
+ dropout=dropout,
997
+ time_embedding_norm=resnet_time_scale_shift,
998
+ non_linearity=resnet_act_fn,
999
+ output_scale_factor=output_scale_factor,
1000
+ pre_norm=resnet_pre_norm,
1001
+ )
1002
+ )
1003
+
1004
+ if add_downsample:
1005
+ self.resnet_down = ResnetBlock2D(
1006
+ in_channels=out_channels,
1007
+ out_channels=out_channels,
1008
+ temb_channels=temb_channels,
1009
+ eps=resnet_eps,
1010
+ groups=min(out_channels // 4, 32),
1011
+ dropout=dropout,
1012
+ time_embedding_norm=resnet_time_scale_shift,
1013
+ non_linearity=resnet_act_fn,
1014
+ output_scale_factor=output_scale_factor,
1015
+ pre_norm=resnet_pre_norm,
1016
+ use_in_shortcut=True,
1017
+ down=True,
1018
+ kernel="fir",
1019
+ )
1020
+ self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)])
1021
+ self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1))
1022
+ else:
1023
+ self.resnet_down = None
1024
+ self.downsamplers = None
1025
+ self.skip_conv = None
1026
+
1027
+ def forward(self, hidden_states, temb=None, skip_sample=None):
1028
+ output_states = ()
1029
+
1030
+ for resnet in self.resnets:
1031
+ hidden_states = resnet(hidden_states, temb)
1032
+ output_states += (hidden_states,)
1033
+
1034
+ if self.downsamplers is not None:
1035
+ hidden_states = self.resnet_down(hidden_states, temb)
1036
+ for downsampler in self.downsamplers:
1037
+ skip_sample = downsampler(skip_sample)
1038
+
1039
+ hidden_states = self.skip_conv(skip_sample) + hidden_states
1040
+
1041
+ output_states += (hidden_states,)
1042
+
1043
+ return hidden_states, output_states, skip_sample
1044
+
1045
+
1046
+ class AttnUpBlock2D(nn.Module):
1047
+ def __init__(
1048
+ self,
1049
+ in_channels: int,
1050
+ prev_output_channel: int,
1051
+ out_channels: int,
1052
+ temb_channels: int,
1053
+ dropout: float = 0.0,
1054
+ num_layers: int = 1,
1055
+ resnet_eps: float = 1e-6,
1056
+ resnet_time_scale_shift: str = "default",
1057
+ resnet_act_fn: str = "swish",
1058
+ resnet_groups: int = 32,
1059
+ resnet_pre_norm: bool = True,
1060
+ attention_type="default",
1061
+ attn_num_head_channels=1,
1062
+ output_scale_factor=1.0,
1063
+ add_upsample=True,
1064
+ ):
1065
+ super().__init__()
1066
+ resnets = []
1067
+ attentions = []
1068
+
1069
+ self.attention_type = attention_type
1070
+
1071
+ for i in range(num_layers):
1072
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1073
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1074
+
1075
+ resnets.append(
1076
+ ResnetBlock2D(
1077
+ in_channels=resnet_in_channels + res_skip_channels,
1078
+ out_channels=out_channels,
1079
+ temb_channels=temb_channels,
1080
+ eps=resnet_eps,
1081
+ groups=resnet_groups,
1082
+ dropout=dropout,
1083
+ time_embedding_norm=resnet_time_scale_shift,
1084
+ non_linearity=resnet_act_fn,
1085
+ output_scale_factor=output_scale_factor,
1086
+ pre_norm=resnet_pre_norm,
1087
+ )
1088
+ )
1089
+ attentions.append(
1090
+ AttentionBlock(
1091
+ out_channels,
1092
+ num_head_channels=attn_num_head_channels,
1093
+ rescale_output_factor=output_scale_factor,
1094
+ eps=resnet_eps,
1095
+ norm_num_groups=resnet_groups,
1096
+ )
1097
+ )
1098
+
1099
+ self.attentions = nn.ModuleList(attentions)
1100
+ self.resnets = nn.ModuleList(resnets)
1101
+
1102
+ if add_upsample:
1103
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1104
+ else:
1105
+ self.upsamplers = None
1106
+
1107
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
1108
+ for resnet, attn in zip(self.resnets, self.attentions):
1109
+ # pop res hidden states
1110
+ res_hidden_states = res_hidden_states_tuple[-1]
1111
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1112
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1113
+
1114
+ hidden_states = resnet(hidden_states, temb)
1115
+ hidden_states = attn(hidden_states)
1116
+
1117
+ if self.upsamplers is not None:
1118
+ for upsampler in self.upsamplers:
1119
+ hidden_states = upsampler(hidden_states)
1120
+
1121
+ return hidden_states
1122
+
1123
+
1124
+ class CrossAttnUpBlock2D(nn.Module):
1125
+ def __init__(
1126
+ self,
1127
+ in_channels: int,
1128
+ out_channels: int,
1129
+ prev_output_channel: int,
1130
+ temb_channels: int,
1131
+ dropout: float = 0.0,
1132
+ num_layers: int = 1,
1133
+ resnet_eps: float = 1e-6,
1134
+ resnet_time_scale_shift: str = "default",
1135
+ resnet_act_fn: str = "swish",
1136
+ resnet_groups: int = 32,
1137
+ resnet_pre_norm: bool = True,
1138
+ attn_num_head_channels=1,
1139
+ cross_attention_dim=1280,
1140
+ attention_type="default",
1141
+ output_scale_factor=1.0,
1142
+ add_upsample=True,
1143
+ dual_cross_attention=False,
1144
+ use_linear_projection=False,
1145
+ only_cross_attention=False,
1146
+ ):
1147
+ super().__init__()
1148
+ resnets = []
1149
+ attentions = []
1150
+
1151
+ self.attention_type = attention_type
1152
+ self.attn_num_head_channels = attn_num_head_channels
1153
+
1154
+ for i in range(num_layers):
1155
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1156
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1157
+
1158
+ resnets.append(
1159
+ ResnetBlock2D(
1160
+ in_channels=resnet_in_channels + res_skip_channels,
1161
+ out_channels=out_channels,
1162
+ temb_channels=temb_channels,
1163
+ eps=resnet_eps,
1164
+ groups=resnet_groups,
1165
+ dropout=dropout,
1166
+ time_embedding_norm=resnet_time_scale_shift,
1167
+ non_linearity=resnet_act_fn,
1168
+ output_scale_factor=output_scale_factor,
1169
+ pre_norm=resnet_pre_norm,
1170
+ )
1171
+ )
1172
+ if not dual_cross_attention:
1173
+ attentions.append(
1174
+ Transformer2DModel(
1175
+ attn_num_head_channels,
1176
+ out_channels // attn_num_head_channels,
1177
+ in_channels=out_channels,
1178
+ num_layers=1,
1179
+ cross_attention_dim=cross_attention_dim,
1180
+ norm_num_groups=resnet_groups,
1181
+ use_linear_projection=use_linear_projection,
1182
+ only_cross_attention=only_cross_attention,
1183
+ )
1184
+ )
1185
+ else:
1186
+ attentions.append(
1187
+ DualTransformer2DModel(
1188
+ attn_num_head_channels,
1189
+ out_channels // attn_num_head_channels,
1190
+ in_channels=out_channels,
1191
+ num_layers=1,
1192
+ cross_attention_dim=cross_attention_dim,
1193
+ norm_num_groups=resnet_groups,
1194
+ )
1195
+ )
1196
+ self.attentions = nn.ModuleList(attentions)
1197
+ self.resnets = nn.ModuleList(resnets)
1198
+
1199
+ if add_upsample:
1200
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1201
+ else:
1202
+ self.upsamplers = None
1203
+
1204
+ self.gradient_checkpointing = False
1205
+
1206
+ def set_attention_slice(self, slice_size):
1207
+ head_dims = self.attn_num_head_channels
1208
+ head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
1209
+ if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
1210
+ raise ValueError(
1211
+ f"Make sure slice_size {slice_size} is a common divisor of "
1212
+ f"the number of heads used in cross_attention: {head_dims}"
1213
+ )
1214
+ if slice_size is not None and slice_size > min(head_dims):
1215
+ raise ValueError(
1216
+ f"slice_size {slice_size} has to be smaller or equal to "
1217
+ f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
1218
+ )
1219
+
1220
+ for attn in self.attentions:
1221
+ attn._set_attention_slice(slice_size)
1222
+
1223
+ self.gradient_checkpointing = False
1224
+
1225
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
1226
+ for attn in self.attentions:
1227
+ attn._set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
1228
+
1229
+ def forward(
1230
+ self,
1231
+ hidden_states,
1232
+ res_hidden_states_tuple,
1233
+ temb=None,
1234
+ encoder_hidden_states=None,
1235
+ upsample_size=None,
1236
+ text_format_dict={}
1237
+ ):
1238
+ for resnet, attn in zip(self.resnets, self.attentions):
1239
+ # pop res hidden states
1240
+ res_hidden_states = res_hidden_states_tuple[-1]
1241
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1242
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1243
+
1244
+ if self.training and self.gradient_checkpointing:
1245
+
1246
+ def create_custom_forward(module, return_dict=None):
1247
+ def custom_forward(*inputs):
1248
+ if return_dict is not None:
1249
+ return module(*inputs, return_dict=return_dict)
1250
+ else:
1251
+ return module(*inputs)
1252
+
1253
+ return custom_forward
1254
+
1255
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1256
+ hidden_states = torch.utils.checkpoint.checkpoint(
1257
+ create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states,
1258
+ text_format_dict
1259
+ )[0]
1260
+ else:
1261
+ hidden_states = resnet(hidden_states, temb)
1262
+ hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states,
1263
+ text_format_dict=text_format_dict).sample
1264
+
1265
+ if self.upsamplers is not None:
1266
+ for upsampler in self.upsamplers:
1267
+ hidden_states = upsampler(hidden_states, upsample_size)
1268
+
1269
+ return hidden_states
1270
+
1271
+
1272
+ class UpBlock2D(nn.Module):
1273
+ def __init__(
1274
+ self,
1275
+ in_channels: int,
1276
+ prev_output_channel: int,
1277
+ out_channels: int,
1278
+ temb_channels: int,
1279
+ dropout: float = 0.0,
1280
+ num_layers: int = 1,
1281
+ resnet_eps: float = 1e-6,
1282
+ resnet_time_scale_shift: str = "default",
1283
+ resnet_act_fn: str = "swish",
1284
+ resnet_groups: int = 32,
1285
+ resnet_pre_norm: bool = True,
1286
+ output_scale_factor=1.0,
1287
+ add_upsample=True,
1288
+ ):
1289
+ super().__init__()
1290
+ resnets = []
1291
+
1292
+ for i in range(num_layers):
1293
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1294
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1295
+
1296
+ resnets.append(
1297
+ ResnetBlock2D(
1298
+ in_channels=resnet_in_channels + res_skip_channels,
1299
+ out_channels=out_channels,
1300
+ temb_channels=temb_channels,
1301
+ eps=resnet_eps,
1302
+ groups=resnet_groups,
1303
+ dropout=dropout,
1304
+ time_embedding_norm=resnet_time_scale_shift,
1305
+ non_linearity=resnet_act_fn,
1306
+ output_scale_factor=output_scale_factor,
1307
+ pre_norm=resnet_pre_norm,
1308
+ )
1309
+ )
1310
+
1311
+ self.resnets = nn.ModuleList(resnets)
1312
+
1313
+ if add_upsample:
1314
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1315
+ else:
1316
+ self.upsamplers = None
1317
+
1318
+ self.gradient_checkpointing = False
1319
+
1320
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
1321
+ for resnet in self.resnets:
1322
+ # pop res hidden states
1323
+ res_hidden_states = res_hidden_states_tuple[-1]
1324
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1325
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1326
+
1327
+ if self.training and self.gradient_checkpointing:
1328
+
1329
+ def create_custom_forward(module):
1330
+ def custom_forward(*inputs):
1331
+ return module(*inputs)
1332
+
1333
+ return custom_forward
1334
+
1335
+ hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
1336
+ else:
1337
+ hidden_states = resnet(hidden_states, temb)
1338
+
1339
+ if self.upsamplers is not None:
1340
+ for upsampler in self.upsamplers:
1341
+ hidden_states = upsampler(hidden_states, upsample_size)
1342
+
1343
+ return hidden_states
1344
+
1345
+
1346
+ class UpDecoderBlock2D(nn.Module):
1347
+ def __init__(
1348
+ self,
1349
+ in_channels: int,
1350
+ out_channels: int,
1351
+ dropout: float = 0.0,
1352
+ num_layers: int = 1,
1353
+ resnet_eps: float = 1e-6,
1354
+ resnet_time_scale_shift: str = "default",
1355
+ resnet_act_fn: str = "swish",
1356
+ resnet_groups: int = 32,
1357
+ resnet_pre_norm: bool = True,
1358
+ output_scale_factor=1.0,
1359
+ add_upsample=True,
1360
+ ):
1361
+ super().__init__()
1362
+ resnets = []
1363
+
1364
+ for i in range(num_layers):
1365
+ input_channels = in_channels if i == 0 else out_channels
1366
+
1367
+ resnets.append(
1368
+ ResnetBlock2D(
1369
+ in_channels=input_channels,
1370
+ out_channels=out_channels,
1371
+ temb_channels=None,
1372
+ eps=resnet_eps,
1373
+ groups=resnet_groups,
1374
+ dropout=dropout,
1375
+ time_embedding_norm=resnet_time_scale_shift,
1376
+ non_linearity=resnet_act_fn,
1377
+ output_scale_factor=output_scale_factor,
1378
+ pre_norm=resnet_pre_norm,
1379
+ )
1380
+ )
1381
+
1382
+ self.resnets = nn.ModuleList(resnets)
1383
+
1384
+ if add_upsample:
1385
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1386
+ else:
1387
+ self.upsamplers = None
1388
+
1389
+ def forward(self, hidden_states):
1390
+ for resnet in self.resnets:
1391
+ hidden_states = resnet(hidden_states, temb=None)
1392
+
1393
+ if self.upsamplers is not None:
1394
+ for upsampler in self.upsamplers:
1395
+ hidden_states = upsampler(hidden_states)
1396
+
1397
+ return hidden_states
1398
+
1399
+
1400
+ class AttnUpDecoderBlock2D(nn.Module):
1401
+ def __init__(
1402
+ self,
1403
+ in_channels: int,
1404
+ out_channels: int,
1405
+ dropout: float = 0.0,
1406
+ num_layers: int = 1,
1407
+ resnet_eps: float = 1e-6,
1408
+ resnet_time_scale_shift: str = "default",
1409
+ resnet_act_fn: str = "swish",
1410
+ resnet_groups: int = 32,
1411
+ resnet_pre_norm: bool = True,
1412
+ attn_num_head_channels=1,
1413
+ output_scale_factor=1.0,
1414
+ add_upsample=True,
1415
+ ):
1416
+ super().__init__()
1417
+ resnets = []
1418
+ attentions = []
1419
+
1420
+ for i in range(num_layers):
1421
+ input_channels = in_channels if i == 0 else out_channels
1422
+
1423
+ resnets.append(
1424
+ ResnetBlock2D(
1425
+ in_channels=input_channels,
1426
+ out_channels=out_channels,
1427
+ temb_channels=None,
1428
+ eps=resnet_eps,
1429
+ groups=resnet_groups,
1430
+ dropout=dropout,
1431
+ time_embedding_norm=resnet_time_scale_shift,
1432
+ non_linearity=resnet_act_fn,
1433
+ output_scale_factor=output_scale_factor,
1434
+ pre_norm=resnet_pre_norm,
1435
+ )
1436
+ )
1437
+ attentions.append(
1438
+ AttentionBlock(
1439
+ out_channels,
1440
+ num_head_channels=attn_num_head_channels,
1441
+ rescale_output_factor=output_scale_factor,
1442
+ eps=resnet_eps,
1443
+ norm_num_groups=resnet_groups,
1444
+ )
1445
+ )
1446
+
1447
+ self.attentions = nn.ModuleList(attentions)
1448
+ self.resnets = nn.ModuleList(resnets)
1449
+
1450
+ if add_upsample:
1451
+ self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
1452
+ else:
1453
+ self.upsamplers = None
1454
+
1455
+ def forward(self, hidden_states):
1456
+ for resnet, attn in zip(self.resnets, self.attentions):
1457
+ hidden_states = resnet(hidden_states, temb=None)
1458
+ hidden_states = attn(hidden_states)
1459
+
1460
+ if self.upsamplers is not None:
1461
+ for upsampler in self.upsamplers:
1462
+ hidden_states = upsampler(hidden_states)
1463
+
1464
+ return hidden_states
1465
+
1466
+
1467
+ class AttnSkipUpBlock2D(nn.Module):
1468
+ def __init__(
1469
+ self,
1470
+ in_channels: int,
1471
+ prev_output_channel: int,
1472
+ out_channels: int,
1473
+ temb_channels: int,
1474
+ dropout: float = 0.0,
1475
+ num_layers: int = 1,
1476
+ resnet_eps: float = 1e-6,
1477
+ resnet_time_scale_shift: str = "default",
1478
+ resnet_act_fn: str = "swish",
1479
+ resnet_pre_norm: bool = True,
1480
+ attn_num_head_channels=1,
1481
+ attention_type="default",
1482
+ output_scale_factor=np.sqrt(2.0),
1483
+ upsample_padding=1,
1484
+ add_upsample=True,
1485
+ ):
1486
+ super().__init__()
1487
+ self.attentions = nn.ModuleList([])
1488
+ self.resnets = nn.ModuleList([])
1489
+
1490
+ self.attention_type = attention_type
1491
+
1492
+ for i in range(num_layers):
1493
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1494
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1495
+
1496
+ self.resnets.append(
1497
+ ResnetBlock2D(
1498
+ in_channels=resnet_in_channels + res_skip_channels,
1499
+ out_channels=out_channels,
1500
+ temb_channels=temb_channels,
1501
+ eps=resnet_eps,
1502
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1503
+ groups_out=min(out_channels // 4, 32),
1504
+ dropout=dropout,
1505
+ time_embedding_norm=resnet_time_scale_shift,
1506
+ non_linearity=resnet_act_fn,
1507
+ output_scale_factor=output_scale_factor,
1508
+ pre_norm=resnet_pre_norm,
1509
+ )
1510
+ )
1511
+
1512
+ self.attentions.append(
1513
+ AttentionBlock(
1514
+ out_channels,
1515
+ num_head_channels=attn_num_head_channels,
1516
+ rescale_output_factor=output_scale_factor,
1517
+ eps=resnet_eps,
1518
+ )
1519
+ )
1520
+
1521
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1522
+ if add_upsample:
1523
+ self.resnet_up = ResnetBlock2D(
1524
+ in_channels=out_channels,
1525
+ out_channels=out_channels,
1526
+ temb_channels=temb_channels,
1527
+ eps=resnet_eps,
1528
+ groups=min(out_channels // 4, 32),
1529
+ groups_out=min(out_channels // 4, 32),
1530
+ dropout=dropout,
1531
+ time_embedding_norm=resnet_time_scale_shift,
1532
+ non_linearity=resnet_act_fn,
1533
+ output_scale_factor=output_scale_factor,
1534
+ pre_norm=resnet_pre_norm,
1535
+ use_in_shortcut=True,
1536
+ up=True,
1537
+ kernel="fir",
1538
+ )
1539
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1540
+ self.skip_norm = torch.nn.GroupNorm(
1541
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1542
+ )
1543
+ self.act = nn.SiLU()
1544
+ else:
1545
+ self.resnet_up = None
1546
+ self.skip_conv = None
1547
+ self.skip_norm = None
1548
+ self.act = None
1549
+
1550
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1551
+ for resnet in self.resnets:
1552
+ # pop res hidden states
1553
+ res_hidden_states = res_hidden_states_tuple[-1]
1554
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1555
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1556
+
1557
+ hidden_states = resnet(hidden_states, temb)
1558
+
1559
+ hidden_states = self.attentions[0](hidden_states)
1560
+
1561
+ if skip_sample is not None:
1562
+ skip_sample = self.upsampler(skip_sample)
1563
+ else:
1564
+ skip_sample = 0
1565
+
1566
+ if self.resnet_up is not None:
1567
+ skip_sample_states = self.skip_norm(hidden_states)
1568
+ skip_sample_states = self.act(skip_sample_states)
1569
+ skip_sample_states = self.skip_conv(skip_sample_states)
1570
+
1571
+ skip_sample = skip_sample + skip_sample_states
1572
+
1573
+ hidden_states = self.resnet_up(hidden_states, temb)
1574
+
1575
+ return hidden_states, skip_sample
1576
+
1577
+
1578
+ class SkipUpBlock2D(nn.Module):
1579
+ def __init__(
1580
+ self,
1581
+ in_channels: int,
1582
+ prev_output_channel: int,
1583
+ out_channels: int,
1584
+ temb_channels: int,
1585
+ dropout: float = 0.0,
1586
+ num_layers: int = 1,
1587
+ resnet_eps: float = 1e-6,
1588
+ resnet_time_scale_shift: str = "default",
1589
+ resnet_act_fn: str = "swish",
1590
+ resnet_pre_norm: bool = True,
1591
+ output_scale_factor=np.sqrt(2.0),
1592
+ add_upsample=True,
1593
+ upsample_padding=1,
1594
+ ):
1595
+ super().__init__()
1596
+ self.resnets = nn.ModuleList([])
1597
+
1598
+ for i in range(num_layers):
1599
+ res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
1600
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1601
+
1602
+ self.resnets.append(
1603
+ ResnetBlock2D(
1604
+ in_channels=resnet_in_channels + res_skip_channels,
1605
+ out_channels=out_channels,
1606
+ temb_channels=temb_channels,
1607
+ eps=resnet_eps,
1608
+ groups=min((resnet_in_channels + res_skip_channels) // 4, 32),
1609
+ groups_out=min(out_channels // 4, 32),
1610
+ dropout=dropout,
1611
+ time_embedding_norm=resnet_time_scale_shift,
1612
+ non_linearity=resnet_act_fn,
1613
+ output_scale_factor=output_scale_factor,
1614
+ pre_norm=resnet_pre_norm,
1615
+ )
1616
+ )
1617
+
1618
+ self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels)
1619
+ if add_upsample:
1620
+ self.resnet_up = ResnetBlock2D(
1621
+ in_channels=out_channels,
1622
+ out_channels=out_channels,
1623
+ temb_channels=temb_channels,
1624
+ eps=resnet_eps,
1625
+ groups=min(out_channels // 4, 32),
1626
+ groups_out=min(out_channels // 4, 32),
1627
+ dropout=dropout,
1628
+ time_embedding_norm=resnet_time_scale_shift,
1629
+ non_linearity=resnet_act_fn,
1630
+ output_scale_factor=output_scale_factor,
1631
+ pre_norm=resnet_pre_norm,
1632
+ use_in_shortcut=True,
1633
+ up=True,
1634
+ kernel="fir",
1635
+ )
1636
+ self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
1637
+ self.skip_norm = torch.nn.GroupNorm(
1638
+ num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True
1639
+ )
1640
+ self.act = nn.SiLU()
1641
+ else:
1642
+ self.resnet_up = None
1643
+ self.skip_conv = None
1644
+ self.skip_norm = None
1645
+ self.act = None
1646
+
1647
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
1648
+ for resnet in self.resnets:
1649
+ # pop res hidden states
1650
+ res_hidden_states = res_hidden_states_tuple[-1]
1651
+ res_hidden_states_tuple = res_hidden_states_tuple[:-1]
1652
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1653
+
1654
+ hidden_states = resnet(hidden_states, temb)
1655
+
1656
+ if skip_sample is not None:
1657
+ skip_sample = self.upsampler(skip_sample)
1658
+ else:
1659
+ skip_sample = 0
1660
+
1661
+ if self.resnet_up is not None:
1662
+ skip_sample_states = self.skip_norm(hidden_states)
1663
+ skip_sample_states = self.act(skip_sample_states)
1664
+ skip_sample_states = self.skip_conv(skip_sample_states)
1665
+
1666
+ skip_sample = skip_sample + skip_sample_states
1667
+
1668
+ hidden_states = self.resnet_up(hidden_states, temb)
1669
+
1670
+ return hidden_states, skip_sample
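
The two skip up blocks above share the same forward contract: they consume the current hidden states plus one residual tensor per resnet layer and return both the upsampled features and an accumulated 3-channel skip image. A minimal shape-level sketch follows, assuming this models/unet_2d_blocks.py module is importable from the repo root and that the diffusers building blocks it relies on (ResnetBlock2D, FirUpsample2D) are installed; the channel and spatial sizes are illustrative only, not values used elsewhere in the repo.

import torch
from models.unet_2d_blocks import SkipUpBlock2D

block = SkipUpBlock2D(
    in_channels=64,
    prev_output_channel=128,
    out_channels=64,
    temb_channels=512,
    num_layers=1,
    add_upsample=True,
)

hidden = torch.randn(1, 128, 16, 16)      # features from the previous up block
res_skip = (torch.randn(1, 64, 16, 16),)  # one skip tensor per resnet layer
temb = torch.randn(1, 512)                # time embedding
hidden, skip = block(hidden, res_skip, temb=temb, skip_sample=None)
print(hidden.shape)  # upsampled feature map, e.g. (1, 64, 32, 32)
print(skip.shape)    # 3-channel skip image produced by skip_conv
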
models/unet_2d_condition.py ADDED
@@ -0,0 +1,411 @@
1
+ # Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.utils.checkpoint
20
+
21
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
22
+ from diffusers.models.modeling_utils import ModelMixin
23
+ from diffusers.utils import BaseOutput, logging
24
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
25
+ from .unet_2d_blocks import (
26
+ CrossAttnDownBlock2D,
27
+ CrossAttnUpBlock2D,
28
+ DownBlock2D,
29
+ UNetMidBlock2DCrossAttn,
30
+ UpBlock2D,
31
+ get_down_block,
32
+ get_up_block,
33
+ )
34
+
35
+
36
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
37
+
38
+
39
+ @dataclass
40
+ class UNet2DConditionOutput(BaseOutput):
41
+ """
42
+ Args:
43
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
44
+ Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
45
+ """
46
+
47
+ sample: torch.FloatTensor
48
+
49
+
50
+ class UNet2DConditionModel(ModelMixin, ConfigMixin):
51
+ r"""
52
+ UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
53
+ and returns sample shaped output.
54
+
55
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
56
+ implements for all the models (such as downloading or saving, etc.)
57
+
58
+ Parameters:
59
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
60
+ Height and width of input/output sample.
61
+ in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
62
+ out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
63
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
64
+ flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
65
+ Whether to flip the sin to cos in the time embedding.
66
+ freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
67
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
68
+ The tuple of downsample blocks to use.
69
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
70
+ The tuple of upsample blocks to use.
71
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
72
+ The tuple of output channels for each block.
73
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
74
+ downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
75
+ mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
76
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
77
+ norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
78
+ norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
79
+ cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
80
+ attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
81
+ """
82
+
83
+ _supports_gradient_checkpointing = True
84
+
85
+ @register_to_config
86
+ def __init__(
87
+ self,
88
+ sample_size: Optional[int] = None,
89
+ in_channels: int = 4,
90
+ out_channels: int = 4,
91
+ center_input_sample: bool = False,
92
+ flip_sin_to_cos: bool = True,
93
+ freq_shift: int = 0,
94
+ down_block_types: Tuple[str] = (
95
+ "CrossAttnDownBlock2D",
96
+ "CrossAttnDownBlock2D",
97
+ "CrossAttnDownBlock2D",
98
+ "DownBlock2D",
99
+ ),
100
+ up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
101
+ only_cross_attention: Union[bool, Tuple[bool]] = False,
102
+ block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
103
+ layers_per_block: int = 2,
104
+ downsample_padding: int = 1,
105
+ mid_block_scale_factor: float = 1,
106
+ act_fn: str = "silu",
107
+ norm_num_groups: int = 32,
108
+ norm_eps: float = 1e-5,
109
+ cross_attention_dim: int = 1280,
110
+ attention_head_dim: Union[int, Tuple[int]] = 8,
111
+ dual_cross_attention: bool = False,
112
+ use_linear_projection: bool = False,
113
+ num_class_embeds: Optional[int] = None,
114
+ ):
115
+ super().__init__()
116
+
117
+ self.sample_size = sample_size
118
+ time_embed_dim = block_out_channels[0] * 4
119
+ # import ipdb;ipdb.set_trace()
120
+
121
+ # input
122
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
123
+
124
+ # time
125
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
126
+ timestep_input_dim = block_out_channels[0]
127
+
128
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
129
+
130
+ # class embedding
131
+ if num_class_embeds is not None:
132
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
133
+
134
+ self.down_blocks = nn.ModuleList([])
135
+ self.mid_block = None
136
+ self.up_blocks = nn.ModuleList([])
137
+
138
+ if isinstance(only_cross_attention, bool):
139
+ only_cross_attention = [only_cross_attention] * len(down_block_types)
140
+
141
+ if isinstance(attention_head_dim, int):
142
+ attention_head_dim = (attention_head_dim,) * len(down_block_types)
143
+
144
+ # down
145
+ output_channel = block_out_channels[0]
146
+ for i, down_block_type in enumerate(down_block_types):
147
+ input_channel = output_channel
148
+ output_channel = block_out_channels[i]
149
+ is_final_block = i == len(block_out_channels) - 1
150
+
151
+ down_block = get_down_block(
152
+ down_block_type,
153
+ num_layers=layers_per_block,
154
+ in_channels=input_channel,
155
+ out_channels=output_channel,
156
+ temb_channels=time_embed_dim,
157
+ add_downsample=not is_final_block,
158
+ resnet_eps=norm_eps,
159
+ resnet_act_fn=act_fn,
160
+ resnet_groups=norm_num_groups,
161
+ cross_attention_dim=cross_attention_dim,
162
+ attn_num_head_channels=attention_head_dim[i],
163
+ downsample_padding=downsample_padding,
164
+ dual_cross_attention=dual_cross_attention,
165
+ use_linear_projection=use_linear_projection,
166
+ only_cross_attention=only_cross_attention[i],
167
+ )
168
+ self.down_blocks.append(down_block)
169
+
170
+ # mid
171
+ self.mid_block = UNetMidBlock2DCrossAttn(
172
+ in_channels=block_out_channels[-1],
173
+ temb_channels=time_embed_dim,
174
+ resnet_eps=norm_eps,
175
+ resnet_act_fn=act_fn,
176
+ output_scale_factor=mid_block_scale_factor,
177
+ resnet_time_scale_shift="default",
178
+ cross_attention_dim=cross_attention_dim,
179
+ attn_num_head_channels=attention_head_dim[-1],
180
+ resnet_groups=norm_num_groups,
181
+ dual_cross_attention=dual_cross_attention,
182
+ use_linear_projection=use_linear_projection,
183
+ )
184
+
185
+ # count how many layers upsample the images
186
+ self.num_upsamplers = 0
187
+
188
+ # up
189
+ reversed_block_out_channels = list(reversed(block_out_channels))
190
+ reversed_attention_head_dim = list(reversed(attention_head_dim))
191
+ only_cross_attention = list(reversed(only_cross_attention))
192
+ output_channel = reversed_block_out_channels[0]
193
+ for i, up_block_type in enumerate(up_block_types):
194
+ is_final_block = i == len(block_out_channels) - 1
195
+
196
+ prev_output_channel = output_channel
197
+ output_channel = reversed_block_out_channels[i]
198
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
199
+
200
+ # add upsample block for all BUT final layer
201
+ if not is_final_block:
202
+ add_upsample = True
203
+ self.num_upsamplers += 1
204
+ else:
205
+ add_upsample = False
206
+
207
+ up_block = get_up_block(
208
+ up_block_type,
209
+ num_layers=layers_per_block + 1,
210
+ in_channels=input_channel,
211
+ out_channels=output_channel,
212
+ prev_output_channel=prev_output_channel,
213
+ temb_channels=time_embed_dim,
214
+ add_upsample=add_upsample,
215
+ resnet_eps=norm_eps,
216
+ resnet_act_fn=act_fn,
217
+ resnet_groups=norm_num_groups,
218
+ cross_attention_dim=cross_attention_dim,
219
+ attn_num_head_channels=reversed_attention_head_dim[i],
220
+ dual_cross_attention=dual_cross_attention,
221
+ use_linear_projection=use_linear_projection,
222
+ only_cross_attention=only_cross_attention[i],
223
+ )
224
+ self.up_blocks.append(up_block)
225
+ prev_output_channel = output_channel
226
+
227
+ # out
228
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
229
+ self.conv_act = nn.SiLU()
230
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
231
+
232
+ def set_attention_slice(self, slice_size):
233
+ head_dims = self.config.attention_head_dim
234
+ head_dims = [head_dims] if isinstance(head_dims, int) else head_dims
235
+ if slice_size is not None and any(dim % slice_size != 0 for dim in head_dims):
236
+ raise ValueError(
237
+ f"Make sure slice_size {slice_size} is a common divisor of "
238
+ f"the number of heads used in cross_attention: {head_dims}"
239
+ )
240
+ if slice_size is not None and slice_size > min(head_dims):
241
+ raise ValueError(
242
+ f"slice_size {slice_size} has to be smaller or equal to "
243
+ f"the lowest number of heads used in cross_attention: min({head_dims}) = {min(head_dims)}"
244
+ )
245
+
246
+ for block in self.down_blocks:
247
+ if hasattr(block, "attentions") and block.attentions is not None:
248
+ block.set_attention_slice(slice_size)
249
+
250
+ self.mid_block.set_attention_slice(slice_size)
251
+
252
+ for block in self.up_blocks:
253
+ if hasattr(block, "attentions") and block.attentions is not None:
254
+ block.set_attention_slice(slice_size)
255
+
256
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
257
+ for block in self.down_blocks:
258
+ if hasattr(block, "attentions") and block.attentions is not None:
259
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
260
+
261
+ self.mid_block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
262
+
263
+ for block in self.up_blocks:
264
+ if hasattr(block, "attentions") and block.attentions is not None:
265
+ block.set_use_memory_efficient_attention_xformers(use_memory_efficient_attention_xformers)
266
+
267
+ def _set_gradient_checkpointing(self, module, value=False):
268
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
269
+ module.gradient_checkpointing = value
270
+
271
+ def forward(
272
+ self,
273
+ sample: torch.FloatTensor,
274
+ timestep: Union[torch.Tensor, float, int],
275
+ encoder_hidden_states: torch.Tensor,
276
+ class_labels: Optional[torch.Tensor] = None,
277
+ text_format_dict = {},
278
+ return_dict: bool = True,
279
+ ) -> Union[UNet2DConditionOutput, Tuple]:
280
+ r"""
281
+ Args:
282
+ sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
283
+ timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
284
+ encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
285
+ return_dict (`bool`, *optional*, defaults to `True`):
286
+ Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
287
+
288
+ Returns:
289
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
290
+ [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
291
+ returning a tuple, the first element is the sample tensor.
292
+ """
293
+ # By default samples have to be at least a multiple of the overall upsampling factor.
294
+ # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
295
+ # However, the upsampling interpolation output size can be forced to fit any upsampling size
296
+ # on the fly if necessary.
297
+ default_overall_up_factor = 2**self.num_upsamplers
298
+
299
+ # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
300
+ forward_upsample_size = False
301
+ upsample_size = None
302
+
303
+ if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
304
+ logger.info("Forward upsample size to force interpolation output size.")
305
+ forward_upsample_size = True
306
+
307
+ # 0. center input if necessary
308
+ if self.config.center_input_sample:
309
+ sample = 2 * sample - 1.0
310
+
311
+ # 1. time
312
+ timesteps = timestep
313
+ if not torch.is_tensor(timesteps):
314
+ # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
315
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
316
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
317
+ timesteps = timesteps[None].to(sample.device)
318
+
319
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
320
+ timesteps = timesteps.expand(sample.shape[0])
321
+
322
+ t_emb = self.time_proj(timesteps)
323
+
324
+ # timesteps does not contain any weights and will always return f32 tensors
325
+ # but time_embedding might actually be running in fp16. so we need to cast here.
326
+ # there might be better ways to encapsulate this.
327
+ t_emb = t_emb.to(dtype=self.dtype)
328
+ emb = self.time_embedding(t_emb)
329
+
330
+ if self.config.num_class_embeds is not None:
331
+ if class_labels is None:
332
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
333
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
334
+ emb = emb + class_emb
335
+
336
+ # 2. pre-process
337
+ sample = self.conv_in(sample)
338
+
339
+ # 3. down
340
+ down_block_res_samples = (sample,)
341
+ for downsample_block in self.down_blocks:
342
+ if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None:
343
+ if isinstance(downsample_block, CrossAttnDownBlock2D):
344
+ sample, res_samples = downsample_block(
345
+ hidden_states=sample,
346
+ temb=emb,
347
+ encoder_hidden_states=encoder_hidden_states,
348
+ text_format_dict=text_format_dict
349
+ )
350
+ else:
351
+ sample, res_samples = downsample_block(
352
+ hidden_states=sample,
353
+ temb=emb,
354
+ encoder_hidden_states=encoder_hidden_states,
355
+ )
356
+ else:
357
+ if isinstance(downsample_block, CrossAttnDownBlock2D):
358
+ raise RuntimeError("CrossAttnDownBlock2D is expected to expose attentions")
359
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
360
+ down_block_res_samples += res_samples
361
+
362
+ # 4. mid
363
+ sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states,
364
+ text_format_dict=text_format_dict)
365
+
366
+ # 5. up
367
+ for i, upsample_block in enumerate(self.up_blocks):
368
+ is_final_block = i == len(self.up_blocks) - 1
369
+
370
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
371
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
372
+
373
+ # if we have not reached the final block and need to forward the
374
+ # upsample size, we do it here
375
+ if not is_final_block and forward_upsample_size:
376
+ upsample_size = down_block_res_samples[-1].shape[2:]
377
+
378
+ if hasattr(upsample_block, "attentions") and upsample_block.attentions is not None:
379
+ if isinstance(upsample_block, CrossAttnUpBlock2D):
380
+ sample = upsample_block(
381
+ hidden_states=sample,
382
+ temb=emb,
383
+ res_hidden_states_tuple=res_samples,
384
+ encoder_hidden_states=encoder_hidden_states,
385
+ upsample_size=upsample_size,
386
+ text_format_dict=text_format_dict
387
+ )
388
+ else:
389
+ sample = upsample_block(
390
+ hidden_states=sample,
391
+ temb=emb,
392
+ res_hidden_states_tuple=res_samples,
393
+ encoder_hidden_states=encoder_hidden_states,
394
+ upsample_size=upsample_size,
395
+ )
396
+ else:
397
+ if isinstance(upsample_block, CrossAttnUpBlock2D):
398
+ # unreachable in practice: CrossAttnUpBlock2D should always expose attentions
399
+ raise RuntimeError("CrossAttnUpBlock2D is expected to expose attentions")
400
+ sample = upsample_block(
401
+ hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
402
+ )
403
+ # 6. post-process
404
+ sample = self.conv_norm_out(sample)
405
+ sample = self.conv_act(sample)
406
+ sample = self.conv_out(sample)
407
+
408
+ if not return_dict:
409
+ return (sample,)
410
+
411
+ return UNet2DConditionOutput(sample=sample)
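
Compared with the stock diffusers UNet2DConditionModel, the forward above threads an extra text_format_dict through every cross-attention block. Below is a hedged, shape-level sketch of that contract; the tiny config and the minimal dict with word_pos/font_size set to None are assumptions for illustration only, since in the repo the model is built inside models/region_diffusion.py and the dict is produced by get_attention_control_input and get_gradient_guidance_input, so it may carry additional keys.

import torch
from models.unet_2d_condition import UNet2DConditionModel

unet = UNet2DConditionModel(
    sample_size=64,
    in_channels=4,
    out_channels=4,
    block_out_channels=(32, 64, 64, 64),  # toy widths; SD v1 uses (320, 640, 1280, 1280)
    cross_attention_dim=768,              # CLIP ViT-L/14 text feature size
    attention_head_dim=8,
    layers_per_block=1,
)

latents = torch.randn(2, 4, 64, 64)
timesteps = torch.tensor([10, 10])
text_emb = torch.randn(2, 77, 768)
text_format_dict = {'word_pos': None, 'font_size': None}  # i.e. no font-size control
out = unet(latents, timesteps, text_emb, text_format_dict=text_format_dict)
print(out.sample.shape)  # torch.Size([2, 4, 64, 64])
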
sample.py ADDED
@@ -0,0 +1,109 @@
1
+ import os
2
+ import json
3
+ import time
4
+ import argparse
5
+ import imageio
6
+ import torch
7
+ import numpy as np
8
+ from torchvision import transforms
9
+
10
+ from models.region_diffusion import RegionDiffusion
11
+ from utils.attention_utils import get_token_maps
12
+ from utils.richtext_utils import seed_everything, parse_json, get_region_diffusion_input,\
13
+ get_attention_control_input, get_gradient_guidance_input
14
+
15
+
16
+ def main(args, param):
17
+
18
+ # Create the folder to store outputs.
19
+ run_dir = args.run_dir
20
+ os.makedirs(args.run_dir, exist_ok=True)
21
+
22
+ # Load region diffusion model.
23
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
24
+ model = RegionDiffusion(device)
25
+
26
+ # parse json to span attributes
27
+ base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
28
+ color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance = parse_json(
29
+ param['text_input'])
30
+
31
+ # create control input for region diffusion
32
+ region_text_prompts, region_target_token_ids, base_tokens = get_region_diffusion_input(
33
+ model, base_text_prompt, style_text_prompts, footnote_text_prompts,
34
+ footnote_target_tokens, color_text_prompts, color_names)
35
+
36
+ # create control input for cross attention
37
+ text_format_dict = get_attention_control_input(
38
+ model, base_tokens, size_text_prompts_and_sizes)
39
+
40
+ # create control input for region guidance
41
+ text_format_dict, color_target_token_ids = get_gradient_guidance_input(
42
+ model, base_tokens, color_text_prompts, color_rgbs, text_format_dict)
43
+
44
+ height = param['height']
45
+ width = param['width']
46
+ seed = param['noise_index']
47
+ negative_text = param['negative_prompt']
48
+ seed_everything(seed)
49
+
50
+ # get token maps from plain text to image generation.
51
+ begin_time = time.time()
52
+ if model.attention_maps is None:
53
+ model.register_evaluation_hooks()
54
+ else:
55
+ model.reset_attention_maps()
56
+ plain_img = model.produce_attn_maps([base_text_prompt], [negative_text],
57
+ height=height, width=width, num_inference_steps=param['steps'],
58
+ guidance_scale=param['guidance_weight'])
59
+ fn_base = os.path.join(run_dir, 'seed%d_plain.png' % (seed))
60
+ imageio.imwrite(fn_base, plain_img[0])
61
+ print('time elapsed to get attention maps: %.4f' % (time.time()-begin_time))
62
+ color_obj_masks = get_token_maps(
63
+ model.attention_maps, run_dir, width//8, height//8, color_target_token_ids, seed)
64
+ model.masks = get_token_maps(
65
+ model.attention_maps, run_dir, width//8, height//8, region_target_token_ids, seed, base_tokens)
66
+ color_obj_masks = [transforms.functional.resize(color_obj_mask, (height, width),
67
+ interpolation=transforms.InterpolationMode.BICUBIC,
68
+ antialias=True)
69
+ for color_obj_mask in color_obj_masks]
70
+ text_format_dict['color_obj_atten'] = color_obj_masks
71
+ model.remove_evaluation_hooks()
72
+
73
+ # generate image from rich text
74
+ begin_time = time.time()
75
+ seed_everything(seed)
76
+ rich_img = model.prompt_to_img(region_text_prompts, [negative_text],
77
+ height=height, width=width, num_inference_steps=param['steps'],
78
+ guidance_scale=param['guidance_weight'], use_grad_guidance=use_grad_guidance,
79
+ text_format_dict=text_format_dict)
80
+ print('time elapsed to generate image from rich text: %.4f' %
81
+ (time.time()-begin_time))
82
+ fn_style = os.path.join(run_dir, 'seed%d_rich.png' % (seed))
83
+ imageio.imwrite(fn_style, rich_img[0])
84
+ # imageio.imwrite(fn_cat, np.concatenate([img[0], rich_img[0]], 1))
85
+
86
+
87
+ if __name__ == '__main__':
88
+ parser = argparse.ArgumentParser()
89
+ parser.add_argument('--run_dir', type=str, default='results/release/debug')
90
+ parser.add_argument('--height', type=int, default=512)
91
+ parser.add_argument('--width', type=int, default=512)
92
+ parser.add_argument('--seed', type=int, default=6)
93
+ parser.add_argument('--sample_steps', type=int, default=41)
94
+ parser.add_argument('--rich_text_json', type=str,
95
+ default='{"ops":[{"insert":"A close-up 4k dslr photo of a "},{"attributes":{"link":"A cat wearing sunglasses and a bandana around its neck."},"insert":"cat"},{"insert":" riding a scooter. There are palm trees in the background."}]}')
96
+ parser.add_argument('--negative_prompt', type=str, default='')
97
+ parser.add_argument('--guidance_weight', type=float, default=8.5)
98
+ args = parser.parse_args()
99
+ param = {
100
+ 'text_input': json.loads(args.rich_text_json),
101
+ 'height': args.height,
102
+ 'width': args.width,
103
+ 'guidance_weight': args.guidance_weight,
104
+ 'steps': args.sample_steps,
105
+ 'noise_index': args.seed,
106
+ 'negative_prompt': args.negative_prompt,
107
+ }
108
+
109
+ main(args, param)
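
Besides the argparse entry point above, the script can also be driven programmatically. A hedged sketch, assuming the repo root is on PYTHONPATH, a CUDA device is available, and RegionDiffusion can download the Stable Diffusion weights; the output directory and the rich-text JSON are made up for this example (the font name "slabo" is one of the keys mapped to a painting style by utils/richtext_utils.font2style below).

import json
from argparse import Namespace
from sample import main

rich_text_json = ('{"ops":[{"insert":"A painting of a "},'
                  '{"attributes":{"font":"slabo"},"insert":"garden"},'
                  '{"insert":" at sunset."}]}')
args = Namespace(run_dir='results/demo')
param = {
    'text_input': json.loads(rich_text_json),
    'height': 512,
    'width': 512,
    'guidance_weight': 8.5,
    'steps': 41,
    'noise_index': 6,
    'negative_prompt': '',
}
main(args, param)  # writes seed6_plain.png and seed6_rich.png into results/demo
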
utils/.DS_Store ADDED
Binary file (6.15 kB).
 
utils/attention_utils.py ADDED
@@ -0,0 +1,201 @@
1
+ import numpy as np
2
+ import os
3
+ import matplotlib as mpl
4
+ import matplotlib.pyplot as plt
5
+ import seaborn as sns
6
+ import torch
7
+ import torchvision
8
+
9
+ from pathlib import Path
10
+ import skimage
11
+ from skimage.morphology import erosion, square
12
+
13
+
14
+ def split_attention_maps_over_steps(attention_maps):
15
+ r"""Function for splitting attention maps over steps.
16
+ Args:
17
+ attention_maps (dict): Dictionary of attention maps.
18
+ sampler_order (int): Order of the sampler.
19
+ """
20
+ # This function splits attention maps into unconditional and conditional score and over steps
21
+
22
+ attention_maps_cond = dict() # Maps corresponding to conditional score
23
+ attention_maps_uncond = dict() # Maps corresponding to unconditional score
24
+
25
+ for layer in attention_maps.keys():
26
+
27
+ for step_num in range(len(attention_maps[layer])):
28
+ if step_num not in attention_maps_cond:
29
+ attention_maps_cond[step_num] = dict()
30
+ attention_maps_uncond[step_num] = dict()
31
+
32
+ attention_maps_uncond[step_num].update(
33
+ {layer: attention_maps[layer][step_num][:1]})
34
+ attention_maps_cond[step_num].update(
35
+ {layer: attention_maps[layer][step_num][1:2]})
36
+
37
+ return attention_maps_cond, attention_maps_uncond
38
+
39
+
40
+ def plot_attention_maps(atten_map_list, obj_tokens, save_dir, seed, tokens_vis=None):
41
+ atten_names = ['presoftmax', 'postsoftmax', 'postsoftmax_erosion']
42
+ for i, (attn_map, obj_token) in enumerate(zip(atten_map_list, obj_tokens)):
43
+ n_obj = len(attn_map)
44
+ plt.figure()
45
+ plt.clf()
46
+
47
+ fig, axs = plt.subplots(
48
+ ncols=n_obj+1, gridspec_kw=dict(width_ratios=[1 for _ in range(n_obj)]+[0.1]))
49
+
50
+ fig.set_figheight(3)
51
+ fig.set_figwidth(3*n_obj+0.1)
52
+
53
+ cmap = plt.get_cmap('OrRd')
54
+
55
+ vmax = 0
56
+ vmin = 1
57
+ for tid in range(n_obj):
58
+ attention_map_cur = attn_map[tid]
59
+ vmax = max(vmax, float(attention_map_cur.max()))
60
+ vmin = min(vmin, float(attention_map_cur.min()))
61
+
62
+ for tid in range(n_obj):
63
+ sns.heatmap(
64
+ attn_map[tid][0], annot=False, cbar=False, ax=axs[tid],
65
+ cmap=cmap, vmin=vmin, vmax=vmax
66
+ )
67
+ axs[tid].set_axis_off()
68
+ if tokens_vis is not None:
69
+ if tid == n_obj-1:
70
+ axs_xlabel = 'other tokens'
71
+ else:
72
+ axs_xlabel = ''
73
+ for token_id in obj_tokens[tid]:
74
+ axs_xlabel += tokens_vis[token_id.item() -
75
+ 1][:-len('</w>')]
76
+ axs[tid].set_title(axs_xlabel)
77
+
78
+ norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
79
+ sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
80
+ fig.colorbar(sm, cax=axs[-1])
81
+
82
+ fig.tight_layout()
83
+ plt.savefig(os.path.join(
84
+ save_dir, 'token_mapes_seed%d_%s.png' % (seed, atten_names[i])), dpi=100)
85
+ plt.close('all')
86
+
87
+
88
+ def get_token_maps(attention_maps, save_dir, width, height, obj_tokens, seed=0, tokens_vis=None,
89
+ preprocess=False):
90
+ r"""Function to visualize attention maps.
91
+ Args:
92
+ save_dir (str): Path to save attention maps
93
+ batch_size (int): Batch size
94
+ sampler_order (int): Sampler order
95
+ """
96
+
97
+ # Split attention maps over steps
98
+ attention_maps_cond, _ = split_attention_maps_over_steps(
99
+ attention_maps
100
+ )
101
+
102
+ selected_layers = [
103
+ # 'down_blocks.0.attentions.0.transformer_blocks.0.attn2',
104
+ # 'down_blocks.0.attentions.1.transformer_blocks.0.attn2',
105
+ 'down_blocks.1.attentions.0.transformer_blocks.0.attn2',
106
+ # 'down_blocks.1.attentions.1.transformer_blocks.0.attn2',
107
+ 'down_blocks.2.attentions.0.transformer_blocks.0.attn2',
108
+ 'down_blocks.2.attentions.1.transformer_blocks.0.attn2',
109
+ 'mid_block.attentions.0.transformer_blocks.0.attn2',
110
+ 'up_blocks.1.attentions.0.transformer_blocks.0.attn2',
111
+ 'up_blocks.1.attentions.1.transformer_blocks.0.attn2',
112
+ 'up_blocks.1.attentions.2.transformer_blocks.0.attn2',
113
+ # 'up_blocks.2.attentions.0.transformer_blocks.0.attn2',
114
+ 'up_blocks.2.attentions.1.transformer_blocks.0.attn2',
115
+ # 'up_blocks.2.attentions.2.transformer_blocks.0.attn2',
116
+ # 'up_blocks.3.attentions.0.transformer_blocks.0.attn2',
117
+ # 'up_blocks.3.attentions.1.transformer_blocks.0.attn2',
118
+ # 'up_blocks.3.attentions.2.transformer_blocks.0.attn2'
119
+ ]
120
+
121
+ nsteps = len(attention_maps_cond)
122
+ hw_ori = width * height
123
+
124
+ attention_maps = []
125
+ for obj_token in obj_tokens:
126
+ attention_maps.append([])
127
+
128
+ for step_num in range(nsteps):
129
+ attention_maps_cur = attention_maps_cond[step_num]
130
+
131
+ for layer in attention_maps_cur.keys():
132
+ if step_num < 10 or layer not in selected_layers:
133
+ continue
134
+
135
+ attention_ind = attention_maps_cur[layer].cpu()
136
+
137
+ # Attention maps are of shape [batch_size, nkeys, 77]
138
+ # since they are averaged out while collecting from hooks to save memory.
139
+ # Now split the heads from batch dimension
140
+ bs, hw, nclip = attention_ind.shape
141
+ down_ratio = np.sqrt(hw_ori // hw)
142
+ width_cur = int(width // down_ratio)
143
+ height_cur = int(height // down_ratio)
144
+ attention_ind = attention_ind.reshape(
145
+ bs, height_cur, width_cur, nclip)
146
+ for obj_id, obj_token in enumerate(obj_tokens):
147
+ if obj_token[0] == -1:
148
+ attention_map_prev = torch.stack(
149
+ [attention_maps[i][-1] for i in range(obj_id)]).sum(0)
150
+ attention_maps[obj_id].append(
151
+ attention_map_prev.max()-attention_map_prev)
152
+ else:
153
+ obj_attention_map = attention_ind[:, :, :, obj_token].max(-1, True)[
154
+ 0].permute([3, 0, 1, 2])
155
+ obj_attention_map = torchvision.transforms.functional.resize(obj_attention_map, (height, width),
156
+ interpolation=torchvision.transforms.InterpolationMode.BICUBIC, antialias=True)
157
+ attention_maps[obj_id].append(obj_attention_map)
158
+
159
+ # average attention maps over steps
160
+ attention_maps_averaged = []
161
+ for obj_id, obj_token in enumerate(obj_tokens):
162
+ if obj_id == len(obj_tokens) - 1:
163
+ attention_maps_averaged.append(
164
+ torch.cat(attention_maps[obj_id]).mean(0))
165
+ else:
166
+ attention_maps_averaged.append(
167
+ torch.cat(attention_maps[obj_id]).mean(0))
168
+
169
+ # normalize attention maps into [0, 1]
170
+ attention_maps_averaged_normalized = []
171
+ attention_maps_averaged_sum = torch.cat(attention_maps_averaged).sum(0)
172
+ for obj_id, obj_token in enumerate(obj_tokens):
173
+ attention_maps_averaged_normalized.append(
174
+ attention_maps_averaged[obj_id]/attention_maps_averaged_sum)
175
+
176
+ # softmax
177
+ attention_maps_averaged_normalized = (
178
+ torch.cat(attention_maps_averaged)/0.001).softmax(0)
179
+ attention_maps_averaged_normalized = [
180
+ attention_maps_averaged_normalized[i:i+1] for i in range(attention_maps_averaged_normalized.shape[0])]
181
+
182
+ if preprocess:
183
+ # it is possible to preprocess the attention maps here
184
+ selem = square(5)
185
+ attention_maps_averaged_eroded = [erosion(skimage.img_as_float(
186
+ map[0].numpy()*255), selem) for map in attention_maps_averaged_normalized[:2]]
187
+ attention_maps_averaged_eroded = [(torch.from_numpy(map).unsqueeze(
188
+ 0)/255. > 0.8).float() for map in attention_maps_averaged_eroded]
189
+ attention_maps_averaged_eroded.append(
190
+ 1 - torch.cat(attention_maps_averaged_eroded).sum(0, True))
191
+ plot_attention_maps([attention_maps_averaged, attention_maps_averaged_normalized,
192
+ attention_maps_averaged_eroded], obj_tokens, save_dir, seed, tokens_vis)
193
+ attention_maps_averaged_eroded = [attn_mask.unsqueeze(1).repeat(
194
+ [1, 4, 1, 1]).cuda() for attn_mask in attention_maps_averaged_eroded]
195
+ return attention_maps_averaged_eroded
196
+ else:
197
+ plot_attention_maps([attention_maps_averaged, attention_maps_averaged_normalized],
198
+ obj_tokens, save_dir, seed, tokens_vis)
199
+ attention_maps_averaged_normalized = [attn_mask.unsqueeze(1).repeat(
200
+ [1, 4, 1, 1]).cuda() for attn_mask in attention_maps_averaged_normalized]
201
+ return attention_maps_averaged_normalized
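
The key step in get_token_maps above is the division by 0.001 before the softmax over token groups: this low-temperature softmax turns the averaged attention maps into near-binary region masks, so each latent position is assigned almost entirely to the token group that attends to it most strongly. A self-contained toy illustration (the numbers are made up):

import torch

maps = torch.tensor([
    [[0.30, 0.10],
     [0.20, 0.25]],      # token group A
    [[0.25, 0.40],
     [0.10, 0.24]],      # token group B
])                        # shape: (num_groups, H, W)

soft = maps.softmax(0)             # plain softmax: fuzzy, overlapping masks
sharp = (maps / 0.001).softmax(0)  # low temperature: ~binary region masks
print(soft[0])
print(sharp[0])                    # ≈1 where group A wins, ≈0 elsewhere
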
utils/richtext_utils.py ADDED
@@ -0,0 +1,234 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import random
5
+ import numpy as np
6
+
7
+ COLORS = {
8
+ 'brown': [165, 42, 42],
9
+ 'red': [255, 0, 0],
10
+ 'pink': [253, 108, 158],
11
+ 'orange': [255, 165, 0],
12
+ 'yellow': [255, 255, 0],
13
+ 'purple': [128, 0, 128],
14
+ 'green': [0, 128, 0],
15
+ 'blue': [0, 0, 255],
16
+ 'white': [255, 255, 255],
17
+ 'gray': [128, 128, 128],
18
+ 'black': [0, 0, 0],
19
+ }
20
+
21
+
22
+ def seed_everything(seed):
23
+ random.seed(seed)
24
+ os.environ['PYTHONHASHSEED'] = str(seed)
25
+ np.random.seed(seed)
26
+ torch.manual_seed(seed)
27
+ torch.cuda.manual_seed(seed)
28
+
29
+
30
+ def hex_to_rgb(hex_string, return_nearest_color=False):
31
+ r"""
32
+ Convert a hex triplet to an RGB triplet.
33
+ """
34
+ # Remove '#' symbol if present
35
+ hex_string = hex_string.lstrip('#')
36
+ # Convert hex values to integers
37
+ red = int(hex_string[0:2], 16)
38
+ green = int(hex_string[2:4], 16)
39
+ blue = int(hex_string[4:6], 16)
40
+ rgb = torch.FloatTensor((red, green, blue))[None, :, None, None]/255.
41
+ if return_nearest_color:
42
+ nearest_color = find_nearest_color(rgb)
43
+ return rgb.cuda(), nearest_color
44
+ return rgb.cuda()
45
+
46
+
47
+ def find_nearest_color(rgb):
48
+ r"""
49
+ Find the nearest neighbor color given the RGB value.
50
+ """
51
+ if isinstance(rgb, list) or isinstance(rgb, tuple):
52
+ rgb = torch.FloatTensor(rgb)[None, :, None, None]/255.
53
+ color_distance = torch.FloatTensor([np.linalg.norm(
54
+ rgb - torch.FloatTensor(COLORS[color])[None, :, None, None]/255.) for color in COLORS.keys()])
55
+ nearest_color = list(COLORS.keys())[torch.argmin(color_distance).item()]
56
+ return nearest_color
57
+
58
+
59
+ def font2style(font):
60
+ r"""
61
+ Convert the font name to the style name.
62
+ """
63
+ return {'mirza': 'Claud Monet, impressionism, oil on canvas',
64
+ 'roboto': 'Ukiyoe',
65
+ 'cursive': 'Cyber Punk, futuristic, blade runner, william gibson, trending on artstation hq',
66
+ 'sofia': 'Pop Art, masterpiece, andy warhol',
67
+ 'slabo': 'Vincent Van Gogh',
68
+ 'inconsolata': 'Pixel Art, 8 bits, 16 bits',
69
+ 'ubuntu': 'Rembrandt',
70
+ 'Monoton': 'neon art, colorful light, highly details, octane render',
71
+ 'Akronim': 'Abstract Cubism, Pablo Picasso', }[font]
72
+
73
+
74
+ def parse_json(json_str):
75
+ r"""
76
+ Convert the JSON string to attributes.
77
+ """
78
+ # initialize region-based attributes.
79
+ base_text_prompt = ''
80
+ style_text_prompts = []
81
+ footnote_text_prompts = []
82
+ footnote_target_tokens = []
83
+ color_text_prompts = []
84
+ color_rgbs = []
85
+ color_names = []
86
+ size_text_prompts_and_sizes = []
87
+
88
+ # parse the attributes from JSON.
89
+ prev_style = None
90
+ prev_color_rgb = None
91
+ use_grad_guidance = False
92
+ for span in json_str['ops']:
93
+ text_prompt = span['insert'].rstrip('\n')
94
+ base_text_prompt += span['insert'].rstrip('\n')
95
+ if text_prompt == ' ':
96
+ continue
97
+ if 'attributes' in span:
98
+ if 'font' in span['attributes']:
99
+ style = font2style(span['attributes']['font'])
100
+ if prev_style == style:
101
+ prev_text_prompt = style_text_prompts[-1].split('in the style of')[
102
+ 0]
103
+ style_text_prompts[-1] = prev_text_prompt + \
104
+ ' ' + text_prompt + f' in the style of {style}'
105
+ else:
106
+ style_text_prompts.append(
107
+ text_prompt + f' in the style of {style}')
108
+ prev_style = style
109
+ else:
110
+ prev_style = None
111
+ if 'link' in span['attributes']:
112
+ footnote_text_prompts.append(span['attributes']['link'])
113
+ footnote_target_tokens.append(text_prompt)
114
+ font_size = 1
115
+ if 'size' in span['attributes'] and 'strike' not in span['attributes']:
116
+ font_size = float(span['attributes']['size'][:-2])/3.
117
+ elif 'size' in span['attributes'] and 'strike' in span['attributes']:
118
+ font_size = -float(span['attributes']['size'][:-2])/3.
119
+ elif 'size' not in span['attributes'] and 'strike' not in span['attributes']:
120
+ font_size = 1
121
+ if 'color' in span['attributes']:
122
+ use_grad_guidance = True
123
+ color_rgb, nearest_color = hex_to_rgb(
124
+ span['attributes']['color'], True)
125
+ if prev_color_rgb == color_rgb:
126
+ prev_text_prompt = color_text_prompts[-1]
127
+ color_text_prompts[-1] = prev_text_prompt + \
128
+ ' ' + text_prompt
129
+ else:
130
+ color_rgbs.append(color_rgb)
131
+ color_names.append(nearest_color)
132
+ color_text_prompts.append(text_prompt)
133
+ if font_size != 1:
134
+ size_text_prompts_and_sizes.append([text_prompt, font_size])
135
+ return base_text_prompt, style_text_prompts, footnote_text_prompts, footnote_target_tokens,\
136
+ color_text_prompts, color_names, color_rgbs, size_text_prompts_and_sizes, use_grad_guidance
137
+
138
+
139
+ def get_region_diffusion_input(model, base_text_prompt, style_text_prompts, footnote_text_prompts,
140
+ footnote_target_tokens, color_text_prompts, color_names):
141
+ r"""
142
+ Algorithm 1 in the paper.
143
+ """
144
+ region_text_prompts = []
145
+ region_target_token_ids = []
146
+ base_tokens = model.tokenizer._tokenize(base_text_prompt)
147
+ # process the style text prompt
148
+ for text_prompt in style_text_prompts:
149
+ region_text_prompts.append(text_prompt)
150
+ region_target_token_ids.append([])
151
+ style_tokens = model.tokenizer._tokenize(
152
+ text_prompt.split('in the style of')[0])
153
+ for style_token in style_tokens:
154
+ region_target_token_ids[-1].append(
155
+ base_tokens.index(style_token)+1)
156
+
157
+ # process the complementary text prompt
158
+ for footnote_text_prompt, text_prompt in zip(footnote_text_prompts, footnote_target_tokens):
159
+ region_target_token_ids.append([])
160
+ region_text_prompts.append(footnote_text_prompt)
161
+ style_tokens = model.tokenizer._tokenize(text_prompt)
162
+ for style_token in style_tokens:
163
+ region_target_token_ids[-1].append(
164
+ base_tokens.index(style_token)+1)
165
+
166
+ # process the color text prompt
167
+ for color_text_prompt, color_name in zip(color_text_prompts, color_names):
168
+ region_target_token_ids.append([])
169
+ region_text_prompts.append(color_name+' '+color_text_prompt)
170
+ style_tokens = model.tokenizer._tokenize(color_text_prompt)
171
+ for style_token in style_tokens:
172
+ region_target_token_ids[-1].append(
173
+ base_tokens.index(style_token)+1)
174
+
175
+ # process the remaining tokens without any attributes
176
+ region_text_prompts.append(base_text_prompt)
177
+ region_target_token_ids_all = [
178
+ id for ids in region_target_token_ids for id in ids]
179
+ target_token_ids_rest = [id for id in range(
180
+ 1, len(base_tokens)+1) if id not in region_target_token_ids_all]
181
+ region_target_token_ids.append(target_token_ids_rest)
182
+
183
+ region_target_token_ids = [torch.LongTensor(
184
+ obj_token_id) for obj_token_id in region_target_token_ids]
185
+ return region_text_prompts, region_target_token_ids, base_tokens
186
+
187
+
188
+ def get_attention_control_input(model, base_tokens, size_text_prompts_and_sizes):
189
+ r"""
190
+ Control the token impact using font sizes.
191
+ """
192
+ word_pos = []
193
+ font_sizes = []
194
+ for text_prompt, font_size in size_text_prompts_and_sizes:
195
+ size_tokens = model.tokenizer._tokenize(text_prompt)
196
+ for size_token in size_tokens:
197
+ word_pos.append(base_tokens.index(size_token)+1)
198
+ font_sizes.append(font_size)
199
+ if len(word_pos) > 0:
200
+ word_pos = torch.LongTensor(word_pos).cuda()
201
+ font_sizes = torch.FloatTensor(font_sizes).cuda()
202
+ else:
203
+ word_pos = None
204
+ font_sizes = None
205
+ text_format_dict = {
206
+ 'word_pos': word_pos,
207
+ 'font_size': font_sizes,
208
+ }
209
+ return text_format_dict
210
+
211
+
212
+ def get_gradient_guidance_input(model, base_tokens, color_text_prompts, color_rgbs, text_format_dict,
213
+ guidance_start_step=999, color_guidance_weight=1):
214
+ r"""
215
+ Prepare gradient-guidance inputs that push token regions toward their target RGB colors.
216
+ """
217
+ color_target_token_ids = []
218
+ for text_prompt in color_text_prompts:
219
+ color_target_token_ids.append([])
220
+ color_tokens = model.tokenizer._tokenize(text_prompt)
221
+ for color_token in color_tokens:
222
+ color_target_token_ids[-1].append(base_tokens.index(color_token)+1)
223
+ color_target_token_ids_all = [
224
+ id for ids in color_target_token_ids for id in ids]
225
+ color_target_token_ids_rest = [id for id in range(
226
+ 1, len(base_tokens)+1) if id not in color_target_token_ids_all]
227
+ color_target_token_ids.append(color_target_token_ids_rest)
228
+ color_target_token_ids = [torch.LongTensor(
229
+ obj_token_id) for obj_token_id in color_target_token_ids]
230
+
231
+ text_format_dict['target_RGB'] = color_rgbs
232
+ text_format_dict['guidance_start_step'] = guidance_start_step
233
+ text_format_dict['color_guidance_weight'] = color_guidance_weight
234
+ return text_format_dict, color_target_token_ids
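
To make the JSON-to-attribute mapping above concrete, here is a hedged example of calling parse_json on a small Quill delta (the same format emitted by the editor and by sample.py). It assumes the repo root is on PYTHONPATH, and it deliberately avoids color attributes because hex_to_rgb moves tensors to CUDA.

from utils.richtext_utils import parse_json

delta = {"ops": [
    {"insert": "A "},
    {"attributes": {"font": "mirza"}, "insert": "garden"},
    {"insert": " with a "},
    {"attributes": {"link": "a red rose bush in full bloom"}, "insert": "rose"},
    {"insert": " bush."},
]}
(base, styles, footnotes, footnote_tokens,
 color_prompts, color_names, color_rgbs, sizes, use_grad) = parse_json(delta)

print(base)     # 'A garden with a rose bush.'
print(styles)   # ['garden in the style of Claud Monet, impressionism, oil on canvas']
print(footnotes, footnote_tokens)  # ['a red rose bush in full bloom'] ['rose']
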