Spaces:

longlian
/

llm-grounded-diffusion

Running on T4

App Files Files Community

Tony Lian commited on Jun 12, 2023

Commit

1f39cf9

1 Parent(s): 58524a7

Add stage 2

Browse files

Files changed (21) hide show

.gitignore +160 -0
README.md +4 -2
app.py +105 -17
baseline.py +41 -0
generation.py +179 -0
models/__init__.py +1 -0
models/attention.py +392 -0
models/attention_processor.py +508 -0
models/modeling_utils.py +874 -0
models/models.py +97 -0
models/pipelines.py +246 -0
models/sam.py +179 -0
models/transformer_2d.py +367 -0
models/unet_2d_blocks.py +793 -0
models/unet_2d_condition.py +980 -0
requirements.txt +11 -0
shared.py +11 -0
utils/__init__.py +1 -0
utils/latents.py +151 -0
utils/parse.py +284 -0
utils/utils.py +165 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

README.md CHANGED Viewed

@@ -4,10 +4,12 @@ emoji: 😊
 colorFrom: red
 colorTo: pink
 sdk: gradio
-sdk_version: 3.32.0
 app_file: app.py
 pinned: true
 tags: [llm, diffusion, grounding, grounded, llm-grounded, text-to-image, language, large language models, layout, generation, generative, customization, personalization, prompting, chatgpt, gpt-3.5, gpt-4]
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorFrom: red
 colorTo: pink
 sdk: gradio
+sdk_version: 3.34.0
 app_file: app.py
 pinned: true
 tags: [llm, diffusion, grounding, grounded, llm-grounded, text-to-image, language, large language models, layout, generation, generative, customization, personalization, prompting, chatgpt, gpt-3.5, gpt-4]
 ---
+Credits:
+This space uses code from [diffusers](https://huggingface.co/docs/diffusers/index), [GLIGEN](https://github.com/gligen/GLIGEN), and [layout-guidance](https://github.com/silent-chen/layout-guidance). Using their code means adhering to their license.

app.py CHANGED Viewed

@@ -4,13 +4,21 @@ import ast
 from matplotlib.patches import Polygon
 from matplotlib.collections import PatchCollection
 import matplotlib.pyplot as plt
 box_scale = (512, 512)
 size = box_scale
 bg_prompt_text = "Background prompt: "
-simplified_prompt = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Please refer to the example below for the desired format.
 Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
 Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
@@ -43,12 +51,20 @@ Background prompt: An oil painting of a living room scene
 Caption: {prompt}
 Objects: """
 def get_lmd_prompt(prompt):
     if prompt == "":
-        prompt = "A realistic photo of a gray cat and an orange dog on the grass."
     return simplified_prompt.format(prompt=prompt)
 def get_layout_image(response):
     gen_boxes, bg_prompt = parse_input(response)
     fig = plt.figure(figsize=(8, 8))
     # https://stackoverflow.com/questions/7821518/save-plot-to-numpy-array
@@ -63,6 +79,35 @@ def get_layout_image(response):
     plt.clf()
     return data
 def parse_input(text=None):
     try:
         if "Objects: " in text:
@@ -130,30 +175,73 @@ def show_boxes(gen_boxes, bg_prompt=None):
     draw_boxes(anns)
-with gr.Blocks() as g:
-    gr.HTML("""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
-            <p>This is a space that allows you to explore the layouts generated by ChatGPT on your own with a simplified set of examples. The layout-to-image generation part will be added.</p>
-            <p>Read our <a href='https://llm-grounded-diffusion.github.io/'>a brief introduction on our project page</a> or <a href='https://arxiv.org/pdf/2305.13655.pdf'>our work on arxiv</a>. <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</p>
             <p><b>Tips:</b><p>
             <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
             <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
-            <p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.<p>""")
-    with gr.Tab("Image Prompt to ChatGPT"):
         with gr.Row():
             with gr.Column(scale=1):
-                prompt = gr.Textbox(lines=2, label="Prompt for Layout Generation", placeholder="A realistic photo of a gray cat and an orange dog on the grass.")
-                greet_btn = gr.Button("Generate Prompt")
             with gr.Column(scale=1):
-                output = gr.Textbox(label="Paste this into ChatGPT (GPT-4 usually gives better results)")
-        greet_btn.click(fn=get_lmd_prompt, inputs=prompt, outputs=output, api_name="get_lmd_prompt")
-    with gr.Tab("Visualize ChatGPT-generated Layout"):
         with gr.Row():
             with gr.Column(scale=1):
-                prompt = gr.Textbox(lines=2, label="Paste ChatGPT response here", placeholder="Paste ChatGPT response here")
-                greet_btn = gr.Button("Visualize Layout")
             with gr.Column(scale=1):
-                output = gr.Image(shape=(512, 512), elem_classes="img", elem_id="img", css="img {width: 300px}")
-        greet_btn.click(fn=get_layout_image, inputs=prompt, outputs=output, api_name="chatgpt-to-layout")
 g.launch()

 from matplotlib.patches import Polygon
 from matplotlib.collections import PatchCollection
 import matplotlib.pyplot as plt
+from utils.parse import filter_boxes
+from generation import run as run_ours
+from baseline import run as run_baseline
+import torch
+print(f"Is CUDA available: {torch.cuda.is_available()}")
+if torch.cuda.is_available():
+    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 box_scale = (512, 512)
 size = box_scale
 bg_prompt_text = "Background prompt: "
+simplified_prompt = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Generate the object descriptions and background prompts in English even if the caption might not be in English. Please refer to the example below for the desired format.
 Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
 Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
 Caption: {prompt}
 Objects: """
+prompt_placeholder = "A realistic photo of a gray cat and an orange dog on the grass."
+layout_placeholder = """Caption: A realistic photo of a gray cat and an orange dog on the grass.
+Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
+Background prompt: A realistic photo of a grassy area."""
 def get_lmd_prompt(prompt):
     if prompt == "":
+        prompt = prompt_placeholder
     return simplified_prompt.format(prompt=prompt)
 def get_layout_image(response):
+    if response == "":
+        response = layout_placeholder
     gen_boxes, bg_prompt = parse_input(response)
     fig = plt.figure(figsize=(8, 8))
     # https://stackoverflow.com/questions/7821518/save-plot-to-numpy-array
     plt.clf()
     return data
+def get_layout_image_gallery(response):
+    return [get_layout_image(response)]
+def get_ours_image(response, seed, fg_seed_start, fg_blending_ratio=0.1, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta=0.3, show_so_imgs=False, scale_boxes=False, gallery=None):
+    if response == "":
+        response = layout_placeholder
+    gen_boxes, bg_prompt = parse_input(response)
+    gen_boxes = filter_boxes(gen_boxes, scale_boxes=scale_boxes)
+    spec = {
+        # prompt is unused
+        'prompt': '',
+        'gen_boxes': gen_boxes,
+        'bg_prompt': bg_prompt
+    }
+    image_np, so_img_list = run_ours(
+        spec, bg_seed=seed, fg_seed_start=fg_seed_start,
+        fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio,
+        gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta)
+    images = [image_np]
+    if show_so_imgs:
+        images.extend([np.asarray(so_img) for so_img in so_img_list])
+    return images
+def get_baseline_image(prompt, seed):
+    if prompt == "":
+        prompt = prompt_placeholder
+    image_np = run_baseline(prompt, bg_seed=seed)
+    return [image_np]
 def parse_input(text=None):
     try:
         if "Objects: " in text:
     draw_boxes(anns)
+duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>'
+with gr.Blocks(
+    title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
+) as g:
+    gr.HTML(f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
+            <h2>LLM + Stable Diffusion => better prompt understanding in text2image generation 🤩</h2>
+            <h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
             <p><b>Tips:</b><p>
             <p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
             <p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the object boxes bigger).</p>
+            <p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.<p>
+            <p>4. Duplicate this space and add GPU to skip the queue and run our model faster. {duplicate_html}</p>
+            <br/>
+            <p>Implementation note: In this demo, we replace the attention manipulation in our layout-guided Stable Diffusion described in our paper with GLIGEN due to much faster inference speed (<b>FlashAttention supported, no backprop needed</b> during inference). Compared to vanilla GLIGEN, we have better coherence. Other parts of text-to-image pipeline, including single object generation and SAM, remain the same. The settings and examples in the prompt are simplified in this demo.</p>""")
+    with gr.Tab("Stage 1. Image Prompt to ChatGPT"):
         with gr.Row():
             with gr.Column(scale=1):
+                prompt = gr.Textbox(lines=2, label="Prompt for Layout Generation", placeholder=prompt_placeholder)
+                generate_btn = gr.Button("Generate Prompt")
             with gr.Column(scale=1):
+                output = gr.Textbox(label="Paste this into ChatGPT (GPT-4 preferred; on Mac, click text and press Command+A and Command+C to copy all)")
+        generate_btn.click(fn=get_lmd_prompt, inputs=prompt, outputs=output, api_name="get_lmd_prompt")
+    # with gr.Tab("(Optional) Visualize ChatGPT-generated Layout"):
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             response = gr.Textbox(lines=5, label="Paste ChatGPT response here", placeholder=layout_placeholder)
+    #             visualize_btn = gr.Button("Visualize Layout")
+    #         with gr.Column(scale=1):
+    #             output = gr.Image(shape=(512, 512), elem_classes="img", elem_id="img", css="img {width: 300px}")
+    #     visualize_btn.click(fn=get_layout_image, inputs=response, outputs=output, api_name="visualize-layout")
+    with gr.Tab("Stage 2 (New). Layout to Image generation"):
         with gr.Row():
             with gr.Column(scale=1):
+                response = gr.Textbox(lines=5, label="Paste ChatGPT response here (no original caption needed)", placeholder=layout_placeholder)
+                visualize_btn = gr.Button("Visualize Layout")
+                generate_btn = gr.Button("Generate Image from Layout", variant='primary')
+                with gr.Accordion("Advanced options", open=False):
+                    seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
+                    fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
+                    fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
+                    frozen_step_ratio = gr.Slider(0, 1, value=0.4, step=0.1, label="Foreground frozen steps ratio (higher: preserve object attributes; lower: higher coherence; set to 0: (almost) equivalent to vanilla GLIGEN except details)")
+                    gligen_scheduled_sampling_beta = gr.Slider(0, 1, value=0.3, step=0.1, label="GLIGEN guidance steps ratio (the beta value)")
+                    show_so_imgs = gr.Checkbox(label="Show annotated single object generations", show_label=False)
             with gr.Column(scale=1):
+                gallery = gr.Gallery(
+                    label="Generated image", show_label=False, elem_id="gallery"
+                ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
+        visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
+        generate_btn.click(fn=get_ours_image, inputs=[response, seed, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, show_so_imgs], outputs=gallery, api_name="layout-to-image")
+    with gr.Tab("Baseline: Stable Diffusion"):
+        with gr.Row():
+            with gr.Column(scale=1):
+                sd_prompt = gr.Textbox(lines=2, label="Prompt for baseline SD", placeholder=prompt_placeholder)
+                generate_btn = gr.Button("Generate")
+                with gr.Accordion("Advanced options", open=False):
+                    seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
+            # with gr.Column(scale=1):
+            #     output = gr.Image(shape=(512, 512), elem_classes="img", elem_id="img")
+            with gr.Column(scale=1):
+                gallery = gr.Gallery(
+                    label="Generated image", show_label=False, elem_id="gallery2"
+                ).style(columns=[1], rows=[1], object_fit="contain", preview=True)
+        generate_btn.click(fn=get_baseline_image, inputs=[sd_prompt, seed], outputs=gallery, api_name="baseline")
 g.launch()

baseline.py ADDED Viewed

	@@ -0,0 +1,41 @@

+# Original Stable Diffusion (1.4)
+import torch
+import models
+from models import pipelines
+from shared import model_dict
+vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+torch.set_grad_enabled(False)
+height = 512  # default height of Stable Diffusion
+width = 512  # default width of Stable Diffusion
+num_inference_steps = 20  # Number of denoising steps
+guidance_scale = 7.5  # Scale for classifier-free guidance
+batch_size = 1
+# h, w
+image_scale = (512, 512)
+bg_negative = 'artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate'
+def run(prompt, bg_seed=1):
+    print(f"prompt: {prompt}")
+    generator = torch.Generator(models.torch_device).manual_seed(bg_seed)
+    prompts = [prompt]
+    input_embeddings = models.encode_prompts(prompts=prompts, tokenizer=tokenizer, text_encoder=text_encoder, negative_prompt=bg_negative)
+    generator = torch.manual_seed(1)  # Seed generator to create the inital latent noise
+    latents = models.get_unscaled_latents(batch_size, unet.config.in_channels, height, width, generator, dtype)
+    latents = latents * scheduler.init_noise_sigma
+    pipelines.gligen_enable_fuser(model_dict['unet'], enabled=False)
+    _, images = pipelines.generate(
+        model_dict, latents, input_embeddings, num_inference_steps,
+        guidance_scale=guidance_scale
+    )
+    return images[0]

generation.py ADDED Viewed

	@@ -0,0 +1,179 @@

+version = "v3.0"
+from PIL import Image
+import torch
+import models
+from models import load_sd
+import utils
+from models import pipelines, sam
+from utils import parse, latents
+from shared import model_dict, sam_model_dict
+verbose = False
+vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+model_dict.update(sam_model_dict)
+# Hyperparams
+height = 512  # default height of Stable Diffusion
+width = 512  # default width of Stable Diffusion
+H, W = height // 8, width // 8 # size of the latent
+num_inference_steps = 20  # Number of denoising steps
+guidance_scale = 7.5  # Scale for classifier-free guidance
+# batch size that is not 1 is not supported
+so_batch_size = 1
+overall_batch_size = 1
+# discourage masks with confidence below
+discourage_mask_below_confidence = 0.85
+# discourage masks with iou (with coarse binarized attention mask) below
+discourage_mask_below_coarse_iou = 0.25
+run_ind = None
+def generate_single_object_with_box(prompt, box, phrase, word, input_latents, input_embeddings,
+                                    sam_refine_kwargs, gligen_scheduled_sampling_beta=0.3,
+                                    verbose=False, visualize=True):
+    bboxes, phrases, words = [box], [phrase], [word]
+    latents, single_object_images, single_object_pil_images_box_ann, latents_all = pipelines.generate_gligen(
+        model_dict, input_latents, input_embeddings, num_inference_steps, bboxes, phrases, gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
+        guidance_scale=guidance_scale, return_saved_cross_attn=False,
+        return_box_vis=True, save_all_latents=True
+    )
+    mask_selected, conf_score_selected = sam.sam_refine_box(sam_input_image=single_object_images[0], box=box, model_dict=model_dict, verbose=verbose, **sam_refine_kwargs)
+    mask_selected_tensor = torch.tensor(mask_selected)
+    return latents_all, mask_selected_tensor, single_object_pil_images_box_ann[0]
+def get_masked_latents_all_list(so_prompt_phrase_word_box_list, input_latents_list, so_input_embeddings, verbose=False, **kwargs):
+    latents_all_list, mask_tensor_list, so_img_list = [], [], []
+    if not so_prompt_phrase_word_box_list:
+        return latents_all_list, mask_tensor_list
+    so_uncond_embeddings, so_cond_embeddings = so_input_embeddings
+    for idx, ((prompt, phrase, word, box), input_latents) in enumerate(zip(so_prompt_phrase_word_box_list, input_latents_list)):
+        so_current_cond_embeddings = so_cond_embeddings[idx:idx+1]
+        so_current_text_embeddings = torch.cat([so_uncond_embeddings, so_current_cond_embeddings], dim=0)
+        so_current_input_embeddings = so_current_text_embeddings, so_uncond_embeddings, so_current_cond_embeddings
+        latents_all, mask_tensor, so_img = generate_single_object_with_box(prompt, box, phrase, word, input_latents, input_embeddings=so_current_input_embeddings, verbose=verbose, **kwargs)
+        latents_all_list.append(latents_all)
+        mask_tensor_list.append(mask_tensor)
+        so_img_list.append(so_img)
+    return latents_all_list, mask_tensor_list, so_img_list
+# Note: need to keep the supervision, especially the box corrdinates, corresponds to each other in single object and overall.
+def run(
+    spec, bg_seed = 1, fg_seed_start = 20, frozen_step_ratio=0.4, gligen_scheduled_sampling_beta = 0.3,
+    so_center_box = False, fg_blending_ratio = 0.1, so_horizontal_center_only = True,
+    align_with_overall_bboxes = False, horizontal_shift_only = True
+):
+    """
+    so_center_box: using centered box in single object generation
+    so_horizontal_center_only: move to the center horizontally only
+    align_with_overall_bboxes: Align the center of the mask, latents, and cross-attention with the center of the box in overall bboxes
+    horizontal_shift_only: only shift horizontally for the alignment of mask, latents, and cross-attention
+    """
+    print("generation:", spec, bg_seed, fg_seed_start, frozen_step_ratio, gligen_scheduled_sampling_beta)
+    frozen_step_ratio = min(max(frozen_step_ratio, 0.), 1.)
+    frozen_steps = int(num_inference_steps * frozen_step_ratio)
+    if True:
+        so_prompt_phrase_word_box_list, overall_prompt, overall_phrases_words_bboxes = parse.convert_spec(spec, height, width, verbose=verbose)
+    overall_phrases, overall_words, overall_bboxes = [item[0] for item in overall_phrases_words_bboxes], [item[1] for item in overall_phrases_words_bboxes], [item[2] for item in overall_phrases_words_bboxes]
+    # The so box is centered but the overall boxes are not (since we need to place to the right place).
+    if so_center_box:
+        so_prompt_phrase_word_box_list = [(prompt, phrase, word, utils.get_centered_box(bbox, horizontal_center_only=so_horizontal_center_only)) for prompt, phrase, word, bbox in so_prompt_phrase_word_box_list]
+        if verbose:
+            print(f"centered so_prompt_phrase_word_box_list: {so_prompt_phrase_word_box_list}")
+    so_boxes = [item[-1] for item in so_prompt_phrase_word_box_list]
+    if True:
+        so_negative_prompt = "artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate, two, many, group, occlusion, occluded, side, border, collate"
+        overall_negative_prompt = "artifacts, blurry, smooth texture, bad quality, distortions, unrealistic, distorted image, bad proportions, duplicate"
+    else:
+        so_negative_prompt = ""
+        overall_negative_prompt = ""
+    sam_refine_kwargs = dict(
+        discourage_mask_below_confidence=discourage_mask_below_confidence, discourage_mask_below_coarse_iou=discourage_mask_below_coarse_iou,
+        height=height, width=width, H=H, W=W
+    )
+    # Note that so and overall use different negative prompts
+    so_prompts = [item[0] for item in so_prompt_phrase_word_box_list]
+    if so_prompts:
+        so_input_embeddings = models.encode_prompts(prompts=so_prompts, tokenizer=tokenizer, text_encoder=text_encoder, negative_prompt=so_negative_prompt, one_uncond_input_only=True)
+    else:
+        so_input_embeddings = []
+    overall_input_embeddings = models.encode_prompts(prompts=[overall_prompt], tokenizer=tokenizer, negative_prompt=overall_negative_prompt, text_encoder=text_encoder)
+    input_latents_list, latents_bg = latents.get_input_latents_list(
+        model_dict, bg_seed=bg_seed, fg_seed_start=fg_seed_start,
+        so_boxes=so_boxes, fg_blending_ratio=fg_blending_ratio, height=height, width=width, verbose=False
+    )
+    latents_all_list, mask_tensor_list, so_img_list = get_masked_latents_all_list(
+        so_prompt_phrase_word_box_list, input_latents_list,
+        gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
+        sam_refine_kwargs=sam_refine_kwargs, so_input_embeddings=so_input_embeddings, verbose=verbose
+    )
+    composed_latents, foreground_indices, offset_list = latents.compose_latents_with_alignment(
+        model_dict, latents_all_list, mask_tensor_list, num_inference_steps,
+        overall_batch_size, height, width, latents_bg=latents_bg,
+        align_with_overall_bboxes=align_with_overall_bboxes, overall_bboxes=overall_bboxes,
+        horizontal_shift_only=horizontal_shift_only
+    )
+    overall_bboxes_flattened, overall_phrases_flattened = [], []
+    for overall_bboxes_item, overall_phrase in zip(overall_bboxes, overall_phrases):
+        for overall_bbox in overall_bboxes_item:
+            overall_bboxes_flattened.append(overall_bbox)
+            overall_phrases_flattened.append(overall_phrase)
+    # Generate with composed latents
+    # Foreground should be frozen
+    frozen_mask = foreground_indices != 0
+    regen_latents, images = pipelines.generate_gligen(
+        model_dict, composed_latents, overall_input_embeddings, num_inference_steps,
+        overall_bboxes_flattened, overall_phrases_flattened, guidance_scale=guidance_scale,
+        gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
+        frozen_steps=frozen_steps, frozen_mask=frozen_mask
+    )
+    print(f"Generation with spatial guidance from input latents and first {frozen_steps} steps frozen (directly from the composed latents input)")
+    print("Generation from composed latents (with semantic guidance)")
+    # display(Image.fromarray(images[0]), "img", run_ind)
+    return images[0], so_img_list

models/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .models import *

models/attention.py ADDED Viewed

	@@ -0,0 +1,392 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.utils import maybe_allow_in_graph
+from .attention_processor import Attention
+from diffusers.models.embeddings import CombinedTimestepLabelEmbeddings
+# https://github.com/gligen/diffusers/blob/23a9a0fab1b48752c7b9bcc98f6fe3b1d8fa7990/src/diffusers/models/attention.py
+class GatedSelfAttentionDense(nn.Module):
+    def __init__(self, query_dim, context_dim, n_heads, d_head):
+        super().__init__()
+        # we need a linear projection since we need cat visual feature and obj feature
+        self.linear = nn.Linear(context_dim, query_dim)
+        self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
+        self.ff = FeedForward(query_dim, activation_fn="geglu")
+        self.norm1 = nn.LayerNorm(query_dim)
+        self.norm2 = nn.LayerNorm(query_dim)
+        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
+        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
+        self.enabled = True
+    def forward(self, x, objs, fuser_attn_kwargs={}):
+        if not self.enabled:
+            return x
+        n_visual = x.shape[1]
+        objs = self.linear(objs)
+        x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)), **fuser_attn_kwargs)[:, :n_visual, :]
+        x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
+        return x
+@maybe_allow_in_graph
+class BasicTransformerBlock(nn.Module):
+    r"""
+    A basic Transformer block.
+    Parameters:
+        dim (`int`): The number of channels in the input and output.
+        num_attention_heads (`int`): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used.
+        double_self_attention (`bool`, *optional*):
+            Whether to use two self-attention layers. In this case no cross attention layers are used.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (:
+            obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
+        attention_bias (:
+            obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        double_self_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_elementwise_affine: bool = True,
+        norm_type: str = "layer_norm",
+        final_dropout: bool = False,
+        use_gated_attention: bool = False,
+    ):
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
+        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
+        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
+            raise ValueError(
+                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
+                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
+            )
+        # Define 3 blocks. Each block has its own normalization layer.
+        # 1. Self-Attn
+        if self.use_ada_layer_norm:
+            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
+        elif self.use_ada_layer_norm_zero:
+            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
+        else:
+            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.attn1 = Attention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+        )
+        # 2. Cross-Attn
+        if cross_attention_dim is not None or double_self_attention:
+            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
+            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
+            # the second cross attention block.
+            self.norm2 = (
+                AdaLayerNorm(dim, num_embeds_ada_norm)
+                if self.use_ada_layer_norm
+                else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+            )
+            self.attn2 = Attention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )  # is self-attn if encoder_hidden_states is none
+        else:
+            self.norm2 = None
+            self.attn2 = None
+        # 3. Feed-forward
+        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
+        # 4. Fuser
+        if use_gated_attention:
+            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        return_cross_attention_probs: bool = None,
+    ):
+        # Notice that normalization is always applied before the real computation in the following blocks.
+        # 0. Prepare GLIGEN inputs
+        if 'gligen' in cross_attention_kwargs:
+            cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+            gligen_kwargs = cross_attention_kwargs.pop('gligen', None)
+        else:
+            cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
+            gligen_kwargs = None
+        # 1. Self-Attention
+        if self.use_ada_layer_norm:
+            norm_hidden_states = self.norm1(hidden_states, timestep)
+        elif self.use_ada_layer_norm_zero:
+            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
+                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+        else:
+            norm_hidden_states = self.norm1(hidden_states)
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+        if self.use_ada_layer_norm_zero:
+            attn_output = gate_msa.unsqueeze(1) * attn_output
+        hidden_states = attn_output + hidden_states
+        # 1.5 GLIGEN Control
+        if gligen_kwargs is not None:
+            # print(gligen_kwargs)
+            hidden_states = self.fuser(hidden_states, gligen_kwargs['objs'], fuser_attn_kwargs=gligen_kwargs.get("fuser_attn_kwargs", {}))
+        # 1.5 ends
+        # 2. Cross-Attention
+        if self.attn2 is not None:
+            norm_hidden_states = (
+                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+            )
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                return_attntion_probs=return_cross_attention_probs,
+                **cross_attention_kwargs,
+            )
+            if return_cross_attention_probs:
+                attn_output, cross_attention_probs = attn_output
+            hidden_states = attn_output + hidden_states
+        # 3. Feed-forward
+        norm_hidden_states = self.norm3(hidden_states)
+        if self.use_ada_layer_norm_zero:
+            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
+        ff_output = self.ff(norm_hidden_states)
+        if self.use_ada_layer_norm_zero:
+            ff_output = gate_mlp.unsqueeze(1) * ff_output
+        hidden_states = ff_output + hidden_states
+        if return_cross_attention_probs and self.attn2 is not None:
+            return hidden_states, cross_attention_probs
+        return hidden_states
+class FeedForward(nn.Module):
+    r"""
+    A feed-forward layer.
+    Parameters:
+        dim (`int`): The number of channels in the input.
+        dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
+        mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
+    """
+    def __init__(
+        self,
+        dim: int,
+        dim_out: Optional[int] = None,
+        mult: int = 4,
+        dropout: float = 0.0,
+        activation_fn: str = "geglu",
+        final_dropout: bool = False,
+    ):
+        super().__init__()
+        inner_dim = int(dim * mult)
+        dim_out = dim_out if dim_out is not None else dim
+        if activation_fn == "gelu":
+            act_fn = GELU(dim, inner_dim)
+        if activation_fn == "gelu-approximate":
+            act_fn = GELU(dim, inner_dim, approximate="tanh")
+        elif activation_fn == "geglu":
+            act_fn = GEGLU(dim, inner_dim)
+        elif activation_fn == "geglu-approximate":
+            act_fn = ApproximateGELU(dim, inner_dim)
+        self.net = nn.ModuleList([])
+        # project in
+        self.net.append(act_fn)
+        # project dropout
+        self.net.append(nn.Dropout(dropout))
+        # project out
+        self.net.append(nn.Linear(inner_dim, dim_out))
+        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
+        if final_dropout:
+            self.net.append(nn.Dropout(dropout))
+    def forward(self, hidden_states):
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
+class GELU(nn.Module):
+    r"""
+    GELU activation function with tanh approximation support with `approximate="tanh"`.
+    """
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+        self.approximate = approximate
+    def gelu(self, gate):
+        if gate.device.type != "mps":
+            return F.gelu(gate, approximate=self.approximate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype)
+    def forward(self, hidden_states):
+        hidden_states = self.proj(hidden_states)
+        hidden_states = self.gelu(hidden_states)
+        return hidden_states
+class GEGLU(nn.Module):
+    r"""
+    A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
+    Parameters:
+        dim_in (`int`): The number of channels in the input.
+        dim_out (`int`): The number of channels in the output.
+    """
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+    def gelu(self, gate):
+        if gate.device.type != "mps":
+            return F.gelu(gate)
+        # mps: gelu is not implemented for float16
+        return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype)
+    def forward(self, hidden_states):
+        hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1)
+        return hidden_states * self.gelu(gate)
+class ApproximateGELU(nn.Module):
+    """
+    The approximate form of Gaussian Error Linear Unit (GELU)
+    For more details, see section 2: https://arxiv.org/abs/1606.08415
+    """
+    def __init__(self, dim_in: int, dim_out: int):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out)
+    def forward(self, x):
+        x = self.proj(x)
+        return x * torch.sigmoid(1.702 * x)
+class AdaLayerNorm(nn.Module):
+    """
+    Norm layer modified to incorporate timestep embeddings.
+    """
+    def __init__(self, embedding_dim, num_embeddings):
+        super().__init__()
+        self.emb = nn.Embedding(num_embeddings, embedding_dim)
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False)
+    def forward(self, x, timestep):
+        emb = self.linear(self.silu(self.emb(timestep)))
+        scale, shift = torch.chunk(emb, 2)
+        x = self.norm(x) * (1 + scale) + shift
+        return x
+class AdaLayerNormZero(nn.Module):
+    """
+    Norm layer adaptive layer norm zero (adaLN-Zero).
+    """
+    def __init__(self, embedding_dim, num_embeddings):
+        super().__init__()
+        self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
+        self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6)
+    def forward(self, x, timestep, class_labels, hidden_dtype=None):
+        emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
+        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1)
+        x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp

models/attention_processor.py ADDED Viewed

	@@ -0,0 +1,508 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import Callable, Optional, Union
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.utils import deprecate, logging, maybe_allow_in_graph
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@maybe_allow_in_graph
+class Attention(nn.Module):
+    r"""
+    A cross attention layer.
+    Parameters:
+        query_dim (`int`): The number of channels in the query.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
+        heads (`int`,  *optional*, defaults to 8): The number of heads to use for multi-head attention.
+        dim_head (`int`,  *optional*, defaults to 64): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        bias (`bool`, *optional*, defaults to False):
+            Set to `True` for the query, key, and value linear layers to contain a bias parameter.
+    """
+    def __init__(
+        self,
+        query_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        heads: int = 8,
+        dim_head: int = 64,
+        dropout: float = 0.0,
+        bias=False,
+        upcast_attention: bool = False,
+        upcast_softmax: bool = False,
+        cross_attention_norm: Optional[str] = None,
+        cross_attention_norm_num_groups: int = 32,
+        added_kv_proj_dim: Optional[int] = None,
+        norm_num_groups: Optional[int] = None,
+        spatial_norm_dim: Optional[int] = None,
+        out_bias: bool = True,
+        scale_qk: bool = True,
+        only_cross_attention: bool = False,
+        eps: float = 1e-5,
+        rescale_output_factor: float = 1.0,
+        residual_connection: bool = False,
+        _from_deprecated_attn_block=False,
+        processor: Optional["AttnProcessor"] = None,
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.upcast_attention = upcast_attention
+        self.upcast_softmax = upcast_softmax
+        self.rescale_output_factor = rescale_output_factor
+        self.residual_connection = residual_connection
+        # we make use of this private variable to know whether this class is loaded
+        # with an deprecated state dict so that we can convert it on the fly
+        self._from_deprecated_attn_block = _from_deprecated_attn_block
+        self.scale_qk = scale_qk
+        self.scale = dim_head**-0.5 if self.scale_qk else 1.0
+        self.heads = heads
+        # for slice_size > 0 the attention score computation
+        # is split across the batch axis to save memory
+        # You can set slice_size with `set_attention_slice`
+        self.sliceable_head_dim = heads
+        self.added_kv_proj_dim = added_kv_proj_dim
+        self.only_cross_attention = only_cross_attention
+        if self.added_kv_proj_dim is None and self.only_cross_attention:
+            raise ValueError(
+                "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
+            )
+        if norm_num_groups is not None:
+            self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
+        else:
+            self.group_norm = None
+        if spatial_norm_dim is not None:
+            self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
+        else:
+            self.spatial_norm = None
+        if cross_attention_norm is None:
+            self.norm_cross = None
+        elif cross_attention_norm == "layer_norm":
+            self.norm_cross = nn.LayerNorm(cross_attention_dim)
+        elif cross_attention_norm == "group_norm":
+            if self.added_kv_proj_dim is not None:
+                # The given `encoder_hidden_states` are initially of shape
+                # (batch_size, seq_len, added_kv_proj_dim) before being projected
+                # to (batch_size, seq_len, cross_attention_dim). The norm is applied
+                # before the projection, so we need to use `added_kv_proj_dim` as
+                # the number of channels for the group norm.
+                norm_cross_num_channels = added_kv_proj_dim
+            else:
+                norm_cross_num_channels = cross_attention_dim
+            self.norm_cross = nn.GroupNorm(
+                num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
+            )
+        else:
+            raise ValueError(
+                f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
+            )
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
+        if not self.only_cross_attention:
+            # only relevant for the `AddedKVProcessor` classes
+            self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
+            self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=bias)
+        else:
+            self.to_k = None
+            self.to_v = None
+        if self.added_kv_proj_dim is not None:
+            self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
+            self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(inner_dim, query_dim, bias=out_bias))
+        self.to_out.append(nn.Dropout(dropout))
+        # set attention processor
+        # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
+        # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
+        # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
+        if processor is None:
+            # processor = (
+            #     AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
+            # )
+            # Note: efficient attention is not used. We can use efficient attention to speed up.
+            processor = AttnProcessor()
+        self.set_processor(processor)
+    def set_processor(self, processor: "AttnProcessor"):
+        # if current processor is in `self._modules` and if passed `processor` is not, we need to
+        # pop `processor` from `self._modules`
+        if (
+            hasattr(self, "processor")
+            and isinstance(self.processor, torch.nn.Module)
+            and not isinstance(processor, torch.nn.Module)
+        ):
+            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
+            self._modules.pop("processor")
+        self.processor = processor
+    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, return_attntion_probs=False, **cross_attention_kwargs):
+        # The `Attention` class can call different attention processors / attention functions
+        # here we simply pass along all tensors to the selected processor class
+        # For standard processors that are defined here, `**cross_attention_kwargs` is empty
+        return self.processor(
+            self,
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            return_attntion_probs=return_attntion_probs,
+            **cross_attention_kwargs,
+        )
+    def batch_to_head_dim(self, tensor):
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+    def head_to_batch_dim(self, tensor, out_dim=3):
+        head_size = self.heads
+        batch_size, seq_len, dim = tensor.shape
+        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+        tensor = tensor.permute(0, 2, 1, 3)
+        if out_dim == 3:
+            tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
+        return tensor
+    def get_attention_scores(self, query, key, attention_mask=None):
+        dtype = query.dtype
+        if self.upcast_attention:
+            query = query.float()
+            key = key.float()
+        if attention_mask is None:
+            baddbmm_input = torch.empty(
+                query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device
+            )
+            beta = 0
+        else:
+            baddbmm_input = attention_mask
+            beta = 1
+        attention_scores = torch.baddbmm(
+            baddbmm_input,
+            query,
+            key.transpose(-1, -2),
+            beta=beta,
+            alpha=self.scale,
+        )
+        del baddbmm_input
+        if self.upcast_softmax:
+            attention_scores = attention_scores.float()
+        attention_probs = attention_scores.softmax(dim=-1)
+        del attention_scores
+        attention_probs = attention_probs.to(dtype)
+        return attention_probs
+    def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
+        if batch_size is None:
+            deprecate(
+                "batch_size=None",
+                "0.0.15",
+                (
+                    "Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
+                    " attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
+                    " `prepare_attention_mask` when preparing the attention_mask."
+                ),
+            )
+            batch_size = 1
+        head_size = self.heads
+        if attention_mask is None:
+            return attention_mask
+        current_length: int = attention_mask.shape[-1]
+        if current_length != target_length:
+            if attention_mask.device.type == "mps":
+                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
+                # Instead, we can manually construct the padding tensor.
+                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
+                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
+                attention_mask = torch.cat([attention_mask, padding], dim=2)
+            else:
+                # TODO: for pipelines such as stable-diffusion, padding cross-attn mask:
+                #       we want to instead pad by (0, remaining_length), where remaining_length is:
+                #       remaining_length: int = target_length - current_length
+                # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding
+                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+        if out_dim == 3:
+            if attention_mask.shape[0] < batch_size * head_size:
+                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
+        elif out_dim == 4:
+            attention_mask = attention_mask.unsqueeze(1)
+            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)
+        return attention_mask
+    def norm_encoder_hidden_states(self, encoder_hidden_states):
+        assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states"
+        if isinstance(self.norm_cross, nn.LayerNorm):
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+        elif isinstance(self.norm_cross, nn.GroupNorm):
+            # Group norm norms along the channels dimension and expects
+            # input to be in the shape of (N, C, *). In this case, we want
+            # to norm along the hidden dimension, so we need to move
+            # (batch_size, sequence_length, hidden_size) ->
+            # (batch_size, hidden_size, sequence_length)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+            encoder_hidden_states = self.norm_cross(encoder_hidden_states)
+            encoder_hidden_states = encoder_hidden_states.transpose(1, 2)
+        else:
+            assert False
+        return encoder_hidden_states
+class AttnProcessor:
+    r"""
+    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
+    """
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+    def __call_fast__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+    ):
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        inner_dim = hidden_states.shape[-1]
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        # the output of sdp = (batch, num_heads, seq_len, head_dim)
+        # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        return_attntion_probs=False,
+        attn_key=None,
+        attn_process_fn=None,
+        return_cond_ca_only=False,
+        return_token_ca_only=None,
+        offload_cross_attn_to_cpu=False,
+        save_attn_to_dict=None,
+        save_keys=None,
+        enable_flash_attn=True,
+    ):
+        """
+        attn_key: current key (a tuple of hierarchy index (up/mid/down, stage id, block id, sub-block id), sub block id should always be 0 in SD UNet)
+        save_attn_to_dict: pass in a dict to save to dict
+        """
+        cross_attn = encoder_hidden_states is not None
+        if (not cross_attn) or (
+            (attn_process_fn is None)
+            and not (save_attn_to_dict is not None and (save_keys is None or (tuple(attn_key) in save_keys)))
+            and not return_attntion_probs):
+            with torch.backends.cuda.sdp_kernel(enable_flash=enable_flash_attn, enable_math=True, enable_mem_efficient=enable_flash_attn):
+                return self.__call_fast__(attn, hidden_states, encoder_hidden_states, attention_mask, temb)
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = attn.to_q(hidden_states)
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        # Currently only process cross-attention
+        if attn_process_fn is not None and cross_attn:
+            attention_probs_before_process = attention_probs.clone()
+            attention_probs = attn_process_fn(attention_probs, query, key, value, attn_key=attn_key, cross_attn=cross_attn, batch_size=batch_size, heads=attn.heads)
+        else:
+            attention_probs_before_process = attention_probs
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        if return_attntion_probs or save_attn_to_dict is not None:
+            # Recover batch dimension: (batch_size, heads, flattened_2d, text_tokens)
+            attention_probs_unflattened = attention_probs_before_process.unflatten(dim=0, sizes=(batch_size, attn.heads))
+            if return_token_ca_only is not None:
+                # (batch size, n heads, 2d dimension, num text tokens)
+                if isinstance(return_token_ca_only, int):
+                    # return_token_ca_only: an integer
+                    attention_probs_unflattened = attention_probs_unflattened[:, :, :, return_token_ca_only:return_token_ca_only+1]
+                else:
+                    # return_token_ca_only: A 1d index tensor
+                    attention_probs_unflattened = attention_probs_unflattened[:, :, :, return_token_ca_only]
+            if return_cond_ca_only:
+                assert batch_size % 2 == 0, f"Samples are not in pairs: {batch_size} samples"
+                attention_probs_unflattened = attention_probs_unflattened[batch_size // 2:]
+            if offload_cross_attn_to_cpu:
+                attention_probs_unflattened = attention_probs_unflattened.cpu()
+            if save_attn_to_dict is not None and (save_keys is None or (tuple(attn_key) in save_keys)):
+                save_attn_to_dict[tuple(attn_key)] = attention_probs_unflattened
+            if return_attntion_probs:
+                return hidden_states, attention_probs_unflattened
+        return hidden_states
+# For typing
+AttentionProcessor = AttnProcessor
+class SpatialNorm(nn.Module):
+    """
+    Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002
+    """
+    def __init__(
+        self,
+        f_channels,
+        zq_channels,
+    ):
+        super().__init__()
+        self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True)
+        self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+        self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, f, zq):
+        f_size = f.shape[-2:]
+        zq = F.interpolate(zq, size=f_size, mode="nearest")
+        norm_f = self.norm_layer(f)
+        new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
+        return new_f

models/modeling_utils.py ADDED Viewed

	@@ -0,0 +1,874 @@

+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import inspect
+import itertools
+import os
+from functools import partial
+from typing import Any, Callable, List, Optional, Tuple, Union
+import torch
+from torch import Tensor, device
+from diffusers import __version__
+from diffusers.utils import (
+    CONFIG_NAME,
+    DIFFUSERS_CACHE,
+    FLAX_WEIGHTS_NAME,
+    HF_HUB_OFFLINE,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    _add_variant,
+    _get_model_file,
+    deprecate,
+    is_accelerate_available,
+    is_safetensors_available,
+    is_torch_version,
+    logging,
+)
+logger = logging.get_logger(__name__)
+if is_torch_version(">=", "1.9.0"):
+    _LOW_CPU_MEM_USAGE_DEFAULT = True
+else:
+    _LOW_CPU_MEM_USAGE_DEFAULT = False
+if is_accelerate_available():
+    import accelerate
+    from accelerate.utils import set_module_tensor_to_device
+    from accelerate.utils.versions import is_torch_version
+if is_safetensors_available():
+    import safetensors
+def get_parameter_device(parameter: torch.nn.Module):
+    try:
+        parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers())
+        return next(parameters_and_buffers).device
+    except StopIteration:
+        # For torch.nn.DataParallel compatibility in PyTorch 1.5
+        def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+            return tuples
+        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+        first_tuple = next(gen)
+        return first_tuple[1].device
+def get_parameter_dtype(parameter: torch.nn.Module):
+    try:
+        params = tuple(parameter.parameters())
+        if len(params) > 0:
+            return params[0].dtype
+        buffers = tuple(parameter.buffers())
+        if len(buffers) > 0:
+            return buffers[0].dtype
+    except StopIteration:
+        # For torch.nn.DataParallel compatibility in PyTorch 1.5
+        def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]:
+            tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
+            return tuples
+        gen = parameter._named_members(get_members_fn=find_tensor_attributes)
+        first_tuple = next(gen)
+        return first_tuple[1].dtype
+def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
+    """
+    Reads a checkpoint file, returning properly formatted errors if they arise.
+    """
+    try:
+        if os.path.basename(checkpoint_file) == _add_variant(WEIGHTS_NAME, variant):
+            return torch.load(checkpoint_file, map_location="cpu")
+        else:
+            return safetensors.torch.load_file(checkpoint_file, device="cpu")
+    except Exception as e:
+        try:
+            with open(checkpoint_file) as f:
+                if f.read().startswith("version"):
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please install "
+                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+                        "you cloned."
+                    )
+                else:
+                    raise ValueError(
+                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+                        "model. Make sure you have saved the model properly."
+                    ) from e
+        except (UnicodeDecodeError, ValueError):
+            raise OSError(
+                f"Unable to load weights from checkpoint file for '{checkpoint_file}' "
+                f"at '{checkpoint_file}'. "
+                "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
+            )
+def _load_state_dict_into_model(model_to_load, state_dict):
+    # Convert old format to new format if needed from a PyTorch state_dict
+    # copy state_dict so _load_from_state_dict can modify it
+    state_dict = state_dict.copy()
+    error_msgs = []
+    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+    # so we need to apply the function recursively.
+    def load(module: torch.nn.Module, prefix=""):
+        args = (state_dict, prefix, {}, True, [], [], error_msgs)
+        module._load_from_state_dict(*args)
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, prefix + name + ".")
+    load(model_to_load)
+    return error_msgs
+class ModelMixin(torch.nn.Module):
+    r"""
+    Base class for all models.
+    [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
+    and saving models.
+        - **config_name** ([`str`]) -- A filename under which the model should be stored when calling
+          [`~models.ModelMixin.save_pretrained`].
+    """
+    config_name = CONFIG_NAME
+    _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
+    _supports_gradient_checkpointing = False
+    def __init__(self):
+        super().__init__()
+    def __getattr__(self, name: str) -> Any:
+        """The only reason we overwrite `getattr` here is to gracefully deprecate accessing
+        config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite
+        __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__':
+        https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
+        """
+        is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name)
+        is_attribute = name in self.__dict__
+        if is_in_config and not is_attribute:
+            deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'."
+            deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3)
+            return self._internal_dict[name]
+        # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module
+        return super().__getattr__(name)
+    @property
+    def is_gradient_checkpointing(self) -> bool:
+        """
+        Whether gradient checkpointing is activated for this model or not.
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules())
+    def enable_gradient_checkpointing(self):
+        """
+        Activates gradient checkpointing for the current model.
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        if not self._supports_gradient_checkpointing:
+            raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.")
+        self.apply(partial(self._set_gradient_checkpointing, value=True))
+    def disable_gradient_checkpointing(self):
+        """
+        Deactivates gradient checkpointing for the current model.
+        Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint
+        activations".
+        """
+        if self._supports_gradient_checkpointing:
+            self.apply(partial(self._set_gradient_checkpointing, value=False))
+    def set_use_memory_efficient_attention_xformers(
+        self, valid: bool, attention_op: Optional[Callable] = None
+    ) -> None:
+        # Recursively walk through all the children.
+        # Any children which exposes the set_use_memory_efficient_attention_xformers method
+        # gets the message
+        def fn_recursive_set_mem_eff(module: torch.nn.Module):
+            if hasattr(module, "set_use_memory_efficient_attention_xformers"):
+                module.set_use_memory_efficient_attention_xformers(valid, attention_op)
+            for child in module.children():
+                fn_recursive_set_mem_eff(child)
+        for module in self.children():
+            if isinstance(module, torch.nn.Module):
+                fn_recursive_set_mem_eff(module)
+    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
+        r"""
+        Enable memory efficient attention as implemented in xformers.
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
+        time. Speed up at training time is not guaranteed.
+        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
+        is used.
+        Parameters:
+            attention_op (`Callable`, *optional*):
+                Override the default `None` operator for use as `op` argument to the
+                [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention)
+                function of xFormers.
+        Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import UNet2DConditionModel
+        >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
+        >>> model = UNet2DConditionModel.from_pretrained(
+        ...     "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
+        ... )
+        >>> model = model.to("cuda")
+        >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
+        ```
+        """
+        self.set_use_memory_efficient_attention_xformers(True, attention_op)
+    def disable_xformers_memory_efficient_attention(self):
+        r"""
+        Disable memory efficient attention as implemented in xformers.
+        """
+        self.set_use_memory_efficient_attention_xformers(False)
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        is_main_process: bool = True,
+        save_function: Callable = None,
+        safe_serialization: bool = False,
+        variant: Optional[str] = None,
+    ):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        `[`~models.ModelMixin.from_pretrained`]` class method.
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+            is_main_process (`bool`, *optional*, defaults to `True`):
+                Whether the process calling this is the main process or not. Useful when in distributed training like
+                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
+                the main process to avoid race conditions.
+            save_function (`Callable`):
+                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
+                need to replace `torch.save` by another method. Can be configured with the environment variable
+                `DIFFUSERS_SAVE_MODE`.
+            safe_serialization (`bool`, *optional*, defaults to `False`):
+                Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+            variant (`str`, *optional*):
+                If specified, weights are saved in the format pytorch_model.<variant>.bin.
+        """
+        if safe_serialization and not is_safetensors_available():
+            raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.")
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+        os.makedirs(save_directory, exist_ok=True)
+        model_to_save = self
+        # Attach architecture to the config
+        # Save the config
+        if is_main_process:
+            model_to_save.save_config(save_directory)
+        # Save the model
+        state_dict = model_to_save.state_dict()
+        weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
+        weights_name = _add_variant(weights_name, variant)
+        # Save the model
+        if safe_serialization:
+            safetensors.torch.save_file(
+                state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"}
+            )
+        else:
+            torch.save(state_dict, os.path.join(save_directory, weights_name))
+        logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
+        r"""
+        Instantiate a pretrained pytorch model from a pre-trained model configuration.
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
+        the model, you should first set it back in training mode with `model.train()`.
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
+        pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
+        task.
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+        weights are discarded.
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids should have an organization name, like `google/ddpm-celebahq-256`.
+                    - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g.,
+                      `./my_model_directory/`.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only(`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `diffusers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+            from_flax (`bool`, *optional*, defaults to `False`):
+                Load the model weights from a Flax checkpoint save file.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+            mirror (`str`, *optional*):
+                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
+                Please refer to the mirror site for more information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
+                same device.
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            max_memory (`Dict`, *optional*):
+                A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
+                GPU and the available CPU RAM if unset.
+            offload_folder (`str` or `os.PathLike`, *optional*):
+                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
+            offload_state_dict (`bool`, *optional*):
+                If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU
+                RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to
+                `True` when there is some disk offload.
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
+            variant (`str`, *optional*):
+                If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin. `variant` is
+                ignored when using `from_flax`.
+            use_safetensors (`bool`, *optional*, defaults to `None`):
+                If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
+                `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
+                `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
+        <Tip>
+         It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+         models](https://huggingface.co/docs/hub/models-gated#gated-models).
+        </Tip>
+        <Tip>
+        Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
+        this method in a firewalled environment.
+        </Tip>
+        """
+        cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
+        ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False)
+        force_download = kwargs.pop("force_download", False)
+        from_flax = kwargs.pop("from_flax", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        output_loading_info = kwargs.pop("output_loading_info", False)
+        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
+        use_auth_token = kwargs.pop("use_auth_token", None)
+        revision = kwargs.pop("revision", None)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        subfolder = kwargs.pop("subfolder", None)
+        device_map = kwargs.pop("device_map", None)
+        max_memory = kwargs.pop("max_memory", None)
+        offload_folder = kwargs.pop("offload_folder", None)
+        offload_state_dict = kwargs.pop("offload_state_dict", False)
+        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
+        variant = kwargs.pop("variant", None)
+        use_safetensors = kwargs.pop("use_safetensors", None)
+        if use_safetensors and not is_safetensors_available():
+            raise ValueError(
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
+            )
+        allow_pickle = False
+        if use_safetensors is None:
+            use_safetensors = is_safetensors_available()
+            allow_pickle = True
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+        if device_map is not None and not is_accelerate_available():
+            raise NotImplementedError(
+                "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set"
+                " `device_map=None`. You can install accelerate with `pip install accelerate`."
+            )
+        # Check if we can handle device_map and dispatching the weights
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
+        # Load config if we don't provide a configuration
+        config_path = pretrained_model_name_or_path
+        user_agent = {
+            "diffusers": __version__,
+            "file_type": "model",
+            "framework": "pytorch",
+        }
+        # load config
+        config, unused_kwargs, commit_hash = cls.load_config(
+            config_path,
+            cache_dir=cache_dir,
+            return_unused_kwargs=True,
+            return_commit_hash=True,
+            force_download=force_download,
+            resume_download=resume_download,
+            proxies=proxies,
+            local_files_only=local_files_only,
+            use_auth_token=use_auth_token,
+            revision=revision,
+            subfolder=subfolder,
+            device_map=device_map,
+            max_memory=max_memory,
+            offload_folder=offload_folder,
+            offload_state_dict=offload_state_dict,
+            user_agent=user_agent,
+            **kwargs,
+        )
+        # load model
+        model_file = None
+        if from_flax:
+            model_file = _get_model_file(
+                pretrained_model_name_or_path,
+                weights_name=FLAX_WEIGHTS_NAME,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                subfolder=subfolder,
+                user_agent=user_agent,
+                commit_hash=commit_hash,
+            )
+            model = cls.from_config(config, **unused_kwargs)
+            # Convert the weights
+            from diffusers.models.modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model
+            model = load_flax_checkpoint_in_pytorch_model(model, model_file)
+        else:
+            if use_safetensors:
+                try:
+                    model_file = _get_model_file(
+                        pretrained_model_name_or_path,
+                        weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant),
+                        cache_dir=cache_dir,
+                        force_download=force_download,
+                        resume_download=resume_download,
+                        proxies=proxies,
+                        local_files_only=local_files_only,
+                        use_auth_token=use_auth_token,
+                        revision=revision,
+                        subfolder=subfolder,
+                        user_agent=user_agent,
+                        commit_hash=commit_hash,
+                    )
+                except IOError as e:
+                    if not allow_pickle:
+                        raise e
+                    pass
+            if model_file is None:
+                model_file = _get_model_file(
+                    pretrained_model_name_or_path,
+                    weights_name=_add_variant(WEIGHTS_NAME, variant),
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    resume_download=resume_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    use_auth_token=use_auth_token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                    commit_hash=commit_hash,
+                )
+            if low_cpu_mem_usage:
+                # Instantiate model with empty weights
+                with accelerate.init_empty_weights():
+                    model = cls.from_config(config, **unused_kwargs)
+                # if device_map is None, load the state dict and move the params from meta device to the cpu
+                if device_map is None:
+                    param_device = "cpu"
+                    state_dict = load_state_dict(model_file, variant=variant)
+                    model._convert_deprecated_attention_blocks(state_dict)
+                    # move the params from meta device to cpu
+                    missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
+                    if len(missing_keys) > 0:
+                        raise ValueError(
+                            f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are"
+                            f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass"
+                            " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize"
+                            " those weights or else make sure your checkpoint file is correct."
+                        )
+                    empty_state_dict = model.state_dict()
+                    for param_name, param in state_dict.items():
+                        accepts_dtype = "dtype" in set(
+                            inspect.signature(set_module_tensor_to_device).parameters.keys()
+                        )
+                        if empty_state_dict[param_name].shape != param.shape:
+                            raise ValueError(
+                                f"Cannot load {pretrained_model_name_or_path} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
+                            )
+                        if accepts_dtype:
+                            set_module_tensor_to_device(
+                                model, param_name, param_device, value=param, dtype=torch_dtype
+                            )
+                        else:
+                            set_module_tensor_to_device(model, param_name, param_device, value=param)
+                else:  # else let accelerate handle loading and dispatching.
+                    # Load weights and dispatch according to the device_map
+                    # by default the device_map is None and the weights are loaded on the CPU
+                    accelerate.load_checkpoint_and_dispatch(
+                        model,
+                        model_file,
+                        device_map,
+                        max_memory=max_memory,
+                        offload_folder=offload_folder,
+                        offload_state_dict=offload_state_dict,
+                        dtype=torch_dtype,
+                    )
+                loading_info = {
+                    "missing_keys": [],
+                    "unexpected_keys": [],
+                    "mismatched_keys": [],
+                    "error_msgs": [],
+                }
+            else:
+                model = cls.from_config(config, **unused_kwargs)
+                state_dict = load_state_dict(model_file, variant=variant)
+                model._convert_deprecated_attention_blocks(state_dict)
+                model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
+                    model,
+                    state_dict,
+                    model_file,
+                    pretrained_model_name_or_path,
+                    ignore_mismatched_sizes=ignore_mismatched_sizes,
+                )
+                loading_info = {
+                    "missing_keys": missing_keys,
+                    "unexpected_keys": unexpected_keys,
+                    "mismatched_keys": mismatched_keys,
+                    "error_msgs": error_msgs,
+                }
+        if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype):
+            raise ValueError(
+                f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}."
+            )
+        elif torch_dtype is not None:
+            model = model.to(torch_dtype)
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
+        # Set model in evaluation mode to deactivate DropOut modules by default
+        model.eval()
+        if output_loading_info:
+            return model, loading_info
+        return model
+    @classmethod
+    def _load_pretrained_model(
+        cls,
+        model,
+        state_dict,
+        resolved_archive_file,
+        pretrained_model_name_or_path,
+        ignore_mismatched_sizes=False,
+    ):
+        # Retrieve missing & unexpected_keys
+        model_state_dict = model.state_dict()
+        loaded_keys = list(state_dict.keys())
+        expected_keys = list(model_state_dict.keys())
+        original_loaded_keys = loaded_keys
+        missing_keys = list(set(expected_keys) - set(loaded_keys))
+        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
+        # Make sure we are able to load base models as well as derived models (with heads)
+        model_to_load = model
+        def _find_mismatched_keys(
+            state_dict,
+            model_state_dict,
+            loaded_keys,
+            ignore_mismatched_sizes,
+        ):
+            mismatched_keys = []
+            if ignore_mismatched_sizes:
+                for checkpoint_key in loaded_keys:
+                    model_key = checkpoint_key
+                    if (
+                        model_key in model_state_dict
+                        and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape
+                    ):
+                        mismatched_keys.append(
+                            (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape)
+                        )
+                        del state_dict[checkpoint_key]
+            return mismatched_keys
+        if state_dict is not None:
+            # Whole checkpoint
+            mismatched_keys = _find_mismatched_keys(
+                state_dict,
+                model_state_dict,
+                original_loaded_keys,
+                ignore_mismatched_sizes,
+            )
+            error_msgs = _load_state_dict_into_model(model_to_load, state_dict)
+        if len(error_msgs) > 0:
+            error_msg = "\n\t".join(error_msgs)
+            if "size mismatch" in error_msg:
+                error_msg += (
+                    "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method."
+                )
+            raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+                f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+                f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
+                " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
+                " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
+                f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
+                " identical (initializing a BertForSequenceClassification model from a"
+                " BertForSequenceClassification model)."
+            )
+        else:
+            logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+                " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+            )
+        elif len(mismatched_keys) == 0:
+            logger.info(
+                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
+                f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
+                " without further training."
+            )
+        if len(mismatched_keys) > 0:
+            mismatched_warning = "\n".join(
+                [
+                    f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+                    for key, shape1, shape2 in mismatched_keys
+                ]
+            )
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
+                f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
+                " able to use it for predictions and inference."
+            )
+        return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
+    @property
+    def device(self) -> device:
+        """
+        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+        device).
+        """
+        return get_parameter_device(self)
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+        """
+        return get_parameter_dtype(self)
+    def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int:
+        """
+        Get number of (optionally, trainable or non-embeddings) parameters in the module.
+        Args:
+            only_trainable (`bool`, *optional*, defaults to `False`):
+                Whether or not to return only the number of trainable parameters
+            exclude_embeddings (`bool`, *optional*, defaults to `False`):
+                Whether or not to return only the number of non-embeddings parameters
+        Returns:
+            `int`: The number of parameters.
+        """
+        if exclude_embeddings:
+            embedding_param_names = [
+                f"{name}.weight"
+                for name, module_type in self.named_modules()
+                if isinstance(module_type, torch.nn.Embedding)
+            ]
+            non_embedding_parameters = [
+                parameter for name, parameter in self.named_parameters() if name not in embedding_param_names
+            ]
+            return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
+        else:
+            return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
+    def _convert_deprecated_attention_blocks(self, state_dict):
+        deprecated_attention_block_paths = []
+        def recursive_find_attn_block(name, module):
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_paths.append(name)
+            for sub_name, sub_module in module.named_children():
+                sub_name = sub_name if name == "" else f"{name}.{sub_name}"
+                recursive_find_attn_block(sub_name, sub_module)
+        recursive_find_attn_block("", self)
+        # NOTE: we have to check if the deprecated parameters are in the state dict
+        # because it is possible we are loading from a state dict that was already
+        # converted
+        for path in deprecated_attention_block_paths:
+            # group_norm path stays the same
+            # query -> to_q
+            if f"{path}.query.weight" in state_dict:
+                state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
+            if f"{path}.query.bias" in state_dict:
+                state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
+            # key -> to_k
+            if f"{path}.key.weight" in state_dict:
+                state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
+            if f"{path}.key.bias" in state_dict:
+                state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
+            # value -> to_v
+            if f"{path}.value.weight" in state_dict:
+                state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
+            if f"{path}.value.bias" in state_dict:
+                state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
+            # proj_attn -> to_out.0
+            if f"{path}.proj_attn.weight" in state_dict:
+                state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
+            if f"{path}.proj_attn.bias" in state_dict:
+                state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")

models/models.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+from transformers import CLIPTextModel, CLIPTokenizer
+from diffusers import AutoencoderKL, DDIMScheduler, DDIMInverseScheduler, DPMSolverMultistepScheduler
+from .unet_2d_condition import UNet2DConditionModel
+from easydict import EasyDict
+import numpy as np
+# For compatibility
+from utils.latents import get_unscaled_latents, get_scaled_latents, blend_latents
+from utils import torch_device
+def load_sd(key="runwayml/stable-diffusion-v1-5", use_fp16=False, load_inverse_scheduler=True, use_dpm_multistep_scheduler=False):
+    """
+    Keys:
+     key = "CompVis/stable-diffusion-v1-4"
+     key = "runwayml/stable-diffusion-v1-5"
+     key = "stabilityai/stable-diffusion-2-1-base"
+    Unpack with:
+    ```
+    model_dict = load_sd(key=key, use_fp16=use_fp16)
+    vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+    ```
+    use_fp16: fp16 might have degraded performance
+    use_dpm_multistep_scheduler: DPMSolverMultistepScheduler
+    """
+    # run final results in fp32
+    if use_fp16:
+        dtype = torch.float16
+        revision = "fp16"
+    else:
+        dtype = torch.float
+        revision = "main"
+    vae = AutoencoderKL.from_pretrained(key, subfolder="vae", revision=revision, torch_dtype=dtype).to(torch_device)
+    tokenizer = CLIPTokenizer.from_pretrained(key, subfolder="tokenizer", revision=revision, torch_dtype=dtype)
+    text_encoder = CLIPTextModel.from_pretrained(key, subfolder="text_encoder", revision=revision, torch_dtype=dtype).to(torch_device)
+    unet = UNet2DConditionModel.from_pretrained(key, subfolder="unet", revision=revision, torch_dtype=dtype).to(torch_device)
+    if use_dpm_multistep_scheduler:
+        scheduler = DPMSolverMultistepScheduler.from_pretrained(key, subfolder="scheduler", revision=revision, torch_dtype=dtype)
+    else:
+        scheduler = DDIMScheduler.from_pretrained(key, subfolder="scheduler", revision=revision, torch_dtype=dtype)
+    model_dict = EasyDict(vae=vae, tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, dtype=dtype)
+    if load_inverse_scheduler:
+        inverse_scheduler = DDIMInverseScheduler.from_config(scheduler.config)
+        model_dict.inverse_scheduler = inverse_scheduler
+    return model_dict
+def encode_prompts(tokenizer, text_encoder, prompts, negative_prompt="", return_full_only=False, one_uncond_input_only=False):
+    if negative_prompt == "":
+        print("Note that negative_prompt is an empty string")
+    text_input = tokenizer(
+        prompts, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt"
+    )
+    max_length = text_input.input_ids.shape[-1]
+    if one_uncond_input_only:
+        num_uncond_input = 1
+    else:
+        num_uncond_input = len(prompts)
+    uncond_input = tokenizer([negative_prompt] * num_uncond_input, padding="max_length", max_length=max_length, return_tensors="pt")
+    with torch.no_grad():
+        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
+        cond_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
+    if one_uncond_input_only:
+        return uncond_embeddings, cond_embeddings
+    text_embeddings = torch.cat([uncond_embeddings, cond_embeddings])
+    if return_full_only:
+        return text_embeddings
+    return text_embeddings, uncond_embeddings, cond_embeddings
+def attn_list_to_tensor(cross_attention_probs):
+    # timestep, CrossAttnBlock, Transformer2DModel, 1xBasicTransformerBlock
+    num_cross_attn_block = len(cross_attention_probs[0])
+    cross_attention_probs_all = []
+    for i in range(num_cross_attn_block):
+        # cross_attention_probs_timestep[i]: Transformer2DModel
+        # 1xBasicTransformerBlock is skipped
+        cross_attention_probs_current = []
+        for cross_attention_probs_timestep in cross_attention_probs:
+            cross_attention_probs_current.append(torch.stack([item for item in cross_attention_probs_timestep[i]], dim=0))
+        cross_attention_probs_current = torch.stack(cross_attention_probs_current, dim=0)
+        cross_attention_probs_all.append(cross_attention_probs_current)
+    return cross_attention_probs_all

models/pipelines.py ADDED Viewed

	@@ -0,0 +1,246 @@

+import torch
+from tqdm import tqdm
+import utils
+from PIL import Image
+import gc
+import numpy as np
+from .attention import GatedSelfAttentionDense
+from .models import torch_device
+@torch.no_grad()
+def encode(model_dict, image, generator):
+    """
+    image should be a PIL object or numpy array with range 0 to 255
+    """
+    vae, dtype = model_dict.vae, model_dict.dtype
+    if isinstance(image, Image.Image):
+        w, h = image.size
+        assert w % 8 == 0 and h % 8 == 0, f"h ({h}) and w ({w}) should be a multiple of 8"
+        # w, h = (x - x % 8 for x in (w, h))  # resize to integer multiple of 8
+        # image = np.array(image.resize((w, h), resample=Image.Resampling.LANCZOS))[None, :]
+        image = np.array(image)
+    if isinstance(image, np.ndarray):
+        assert image.dtype == np.uint8, f"Should have dtype uint8 (dtype: {image.dtype})"
+        image = image.astype(np.float32) / 255.0
+        image = image[None, ...]
+        image = image.transpose(0, 3, 1, 2)
+        image = 2.0 * image - 1.0
+        image = torch.from_numpy(image)
+    assert isinstance(image, torch.Tensor), f"type of image: {type(image)}"
+    image = image.to(device=torch_device, dtype=dtype)
+    latents = vae.encode(image).latent_dist.sample(generator)
+    latents = vae.config.scaling_factor * latents
+    return latents
+@torch.no_grad()
+def decode(vae, latents):
+    # scale and decode the image latents with vae
+    scaled_latents = 1 / 0.18215 * latents
+    with torch.no_grad():
+        image = vae.decode(scaled_latents).sample
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+    return images
+@torch.no_grad()
+def generate(model_dict, latents, input_embeddings, num_inference_steps, guidance_scale = 7.5, no_set_timesteps=False):
+    vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+    text_embeddings, uncond_embeddings, cond_embeddings = input_embeddings
+    if not no_set_timesteps:
+        scheduler.set_timesteps(num_inference_steps)
+    for t in tqdm(scheduler.timesteps):
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+        # predict the noise residual
+        with torch.no_grad():
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+    images = decode(vae, latents)
+    ret = [latents, images]
+    return tuple(ret)
+def gligen_enable_fuser(unet, enabled=True):
+    for module in unet.modules():
+        if isinstance(module, GatedSelfAttentionDense):
+            module.enabled = enabled
+@torch.no_grad()
+def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps, bboxes, phrases, num_images_per_prompt=1, gligen_scheduled_sampling_beta: float = 0.3, guidance_scale=7.5,
+    frozen_steps=20, frozen_mask=None,
+    return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
+    offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
+    semantic_guidance=False, semantic_guidance_bboxes=None, semantic_guidance_object_positions=None, semantic_guidance_kwargs=None,
+    return_box_vis=False, show_progress=True, save_all_latents=False):
+    """
+    The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
+    """
+    vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+    text_embeddings, uncond_embeddings, cond_embeddings = input_embeddings
+    if latents.dim() == 5:
+        # latents_all from the input side, different from the latents_all to be saved
+        latents_all_input = latents
+        latents = latents[0]
+    else:
+        latents_all_input = None
+    # Just in case that we have in-place ops
+    latents = latents.clone()
+    if save_all_latents:
+        # offload to cpu to save space
+        if offload_latents_to_cpu:
+            latents_all = [latents.cpu()]
+        else:
+            latents_all = [latents]
+    scheduler.set_timesteps(num_inference_steps)
+    if frozen_mask is not None:
+        frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)
+    batch_size = 1
+    # 5.1 Prepare GLIGEN variables
+    assert len(phrases) == len(bboxes)
+    # assert batch_size == 1
+    max_objs = 30
+    _boxes = bboxes
+    n_objs = min(len(_boxes), max_objs)
+    boxes = torch.zeros(max_objs, 4, device=torch_device, dtype=dtype)
+    phrase_embeddings = torch.zeros(max_objs, 768, device=torch_device, dtype=dtype)
+    masks = torch.zeros(max_objs, device=torch_device, dtype=dtype)
+    if n_objs > 0:
+        boxes[:n_objs] = torch.tensor(_boxes[:n_objs])
+        tokenizer_inputs = tokenizer(phrases, padding=True, return_tensors="pt").to(torch_device)
+        _phrase_embeddings = text_encoder(**tokenizer_inputs).pooler_output
+        phrase_embeddings[:n_objs] = _phrase_embeddings[:n_objs]
+        masks[:n_objs] = 1
+    # Classifier-free guidance
+    repeat_batch = batch_size * num_images_per_prompt * 2
+    boxes = boxes.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+    phrase_embeddings = phrase_embeddings.unsqueeze(0).expand(repeat_batch, -1, -1).clone()
+    masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone()
+    masks[:repeat_batch // 2] = 0
+    if semantic_guidance_bboxes and semantic_guidance:
+        loss = torch.tensor(10000.)
+        # TODO: we can also save necessary tokens only to save memory.
+        # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
+        guidance_cross_attention_kwargs = {
+            'offload_cross_attn_to_cpu': False,
+            'enable_flash_attn': False,
+            'gligen': {
+                'boxes': boxes[:repeat_batch // 2],
+                'positive_embeddings': phrase_embeddings[:repeat_batch // 2],
+                'masks': masks[:repeat_batch // 2],
+                'fuser_attn_kwargs': {
+                    'enable_flash_attn': False,
+                }
+            }
+        }
+    if return_saved_cross_attn:
+        saved_attns = []
+    main_cross_attention_kwargs = {
+        'offload_cross_attn_to_cpu': offload_cross_attn_to_cpu,
+        'return_cond_ca_only': return_cond_ca_only,
+        'return_token_ca_only': return_token_ca_only,
+        'save_keys': saved_cross_attn_keys,
+        'gligen': {
+            'boxes': boxes,
+            'positive_embeddings': phrase_embeddings,
+            'masks': masks
+        }
+    }
+    timesteps = scheduler.timesteps
+    num_grounding_steps = int(gligen_scheduled_sampling_beta * len(timesteps))
+    gligen_enable_fuser(unet, True)
+    for index, t in enumerate(tqdm(timesteps, disable=not show_progress)):
+        # Scheduled sampling
+        if index == num_grounding_steps:
+            gligen_enable_fuser(unet, False)
+        if semantic_guidance_bboxes and semantic_guidance:
+            with torch.enable_grad():
+                latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, semantic_guidance_bboxes, semantic_guidance_object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        latent_model_input = torch.cat([latents] * 2)
+        latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+        main_cross_attention_kwargs['save_attn_to_dict'] = {}
+        # predict the noise residual
+        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings,
+                            cross_attention_kwargs=main_cross_attention_kwargs).sample
+        if return_saved_cross_attn:
+            saved_attns.append(main_cross_attention_kwargs['save_attn_to_dict'])
+            del main_cross_attention_kwargs['save_attn_to_dict']
+        # perform guidance
+        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = scheduler.step(noise_pred, t, latents).prev_sample
+        if frozen_mask is not None and index < frozen_steps:
+            latents = latents_all_input[index+1] * frozen_mask + latents * (1. - frozen_mask)
+        if save_all_latents:
+            if offload_latents_to_cpu:
+                latents_all.append(latents.cpu())
+            else:
+                latents_all.append(latents)
+    # Turn off fuser for typical SD
+    gligen_enable_fuser(unet, False)
+    images = decode(vae, latents)
+    ret = [latents, images]
+    if return_saved_cross_attn:
+        ret.append(saved_attns)
+    if return_box_vis:
+        pil_images = [utils.draw_box(Image.fromarray(image), bboxes, phrases) for image in images]
+        ret.append(pil_images)
+    if save_all_latents:
+        latents_all = torch.stack(latents_all, dim=0)
+        ret.append(latents_all)
+    return tuple(ret)

models/sam.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import gc
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+from models import torch_device
+from transformers import SamModel, SamProcessor
+import utils
+import cv2
+from scipy import ndimage
+def load_sam():
+    sam_model = SamModel.from_pretrained("facebook/sam-vit-base").to(torch_device)
+    sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+    sam_model_dict = dict(
+        sam_model = sam_model, sam_processor = sam_processor
+    )
+    return sam_model_dict
+# Not fully backward compatible with the previous implementation
+# Reference: lmdv2/notebooks/gen_masked_latents_multi_object_ref_ca_loss_modular.ipynb
+def sam(sam_model_dict, image, input_points=None, input_boxes=None, target_mask_shape=None):
+    """target_mask_shape: (h, w)"""
+    sam_model, sam_processor = sam_model_dict['sam_model'], sam_model_dict['sam_processor']
+    with torch.no_grad():
+        with torch.autocast(torch_device):
+            inputs = sam_processor(image, input_points=input_points, input_boxes=input_boxes, return_tensors="pt").to(torch_device)
+            outputs = sam_model(**inputs)
+        masks = sam_processor.image_processor.post_process_masks(
+            outputs.pred_masks.cpu().float(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+        )
+        conf_scores = outputs.iou_scores.to(device="cpu", dtype=torch.float32).numpy()[0,0]
+        del inputs, outputs
+    gc.collect()
+    if torch_device == "cuda":
+        torch.cuda.empty_cache()
+    masks = masks[0][0].numpy()
+    if target_mask_shape is not None:
+        masks = np.array([cv2.resize(mask.astype(np.uint8) * 255, target_mask_shape[::-1], cv2.INTER_LINEAR).astype(bool) for mask in masks])
+    return masks, conf_scores
+def sam_point_input(sam_model_dict, image, input_points, **kwargs):
+    return sam(sam_model_dict, image, input_points=input_points, **kwargs)
+def sam_box_input(sam_model_dict, image, input_boxes, **kwargs):
+    return sam(sam_model_dict, image, input_boxes=input_boxes, **kwargs)
+def get_iou_with_resize(mask, masks, masks_shape):
+    masks = np.array([cv2.resize(mask.astype(np.uint8) * 255, masks_shape[::-1], cv2.INTER_LINEAR).astype(bool) for mask in masks])
+    return utils.iou(mask, masks)
+def select_mask(masks, conf_scores, coarse_ious=None, rule="largest_over_conf", discourage_mask_below_confidence=0.85, discourage_mask_below_coarse_iou=0.2, verbose=False):
+    """masks: numpy bool array"""
+    mask_sizes = masks.sum(axis=(1, 2))
+    # Another possible rule: iou with the attention mask
+    if rule == "largest_over_conf":
+        # Use the largest segmentation
+        # Discourage selecting masks with conf too low or coarse iou is too low
+        max_mask_size = np.max(mask_sizes)
+        if coarse_ious is not None:
+            scores = mask_sizes - (conf_scores < discourage_mask_below_confidence) * max_mask_size - (coarse_ious < discourage_mask_below_coarse_iou) * max_mask_size
+        else:
+            scores = mask_sizes - (conf_scores < discourage_mask_below_confidence) * max_mask_size
+        if verbose:
+            print(f"mask_sizes: {mask_sizes}, scores: {scores}")
+    else:
+        raise ValueError(f"Unknown rule: {rule}")
+    mask_id = np.argmax(scores)
+    mask = masks[mask_id]
+    selection_conf = conf_scores[mask_id]
+    if coarse_ious is not None:
+        selection_coarse_iou = coarse_ious[mask_id]
+    else:
+        selection_coarse_iou = None
+    if verbose:
+        # print(f"Confidences: {conf_scores}")
+        print(f"Selected a mask with confidence: {selection_conf}, coarse_iou: {selection_coarse_iou}")
+    if verbose:
+        plt.figure(figsize=(10, 8))
+        # plt.suptitle("After SAM")
+        for ind in range(3):
+            plt.subplot(1, 3, ind+1)
+            # This is obtained before resize.
+            plt.title(f"Mask {ind}, score {scores[ind]}, conf {conf_scores[ind]:.2f}, iou {coarse_ious[ind] if coarse_ious is not None else None:.2f}")
+            plt.imshow(masks[ind])
+        plt.tight_layout()
+        plt.show()
+    return mask, selection_conf
+def preprocess_mask(token_attn_np_smooth, mask_th, n_erode_dilate_mask=0):
+    token_attn_np_smooth_normalized = token_attn_np_smooth - token_attn_np_smooth.min()
+    token_attn_np_smooth_normalized /= token_attn_np_smooth_normalized.max()
+    mask_thresholded = token_attn_np_smooth_normalized > mask_th
+    if n_erode_dilate_mask:
+        mask_thresholded = ndimage.binary_erosion(mask_thresholded, iterations=n_erode_dilate_mask)
+        mask_thresholded = ndimage.binary_dilation(mask_thresholded, iterations=n_erode_dilate_mask)
+    return mask_thresholded
+# The overall pipeline to refine the attention mask
+def sam_refine_attn(sam_input_image, token_attn_np, model_dict, height, width, H, W, use_box_input, gaussian_sigma, mask_th_for_box, n_erode_dilate_mask_for_box, mask_th_for_point, discourage_mask_below_confidence, discourage_mask_below_coarse_iou, verbose):
+    # token_attn_np is for visualizations
+    token_attn_np_smooth = ndimage.gaussian_filter(token_attn_np, sigma=gaussian_sigma)
+    # (w, h)
+    mask_size_scale = height // token_attn_np_smooth.shape[1], width // token_attn_np_smooth.shape[0]
+    if use_box_input:
+        # box input
+        mask_binary = preprocess_mask(token_attn_np_smooth, mask_th_for_box, n_erode_dilate_mask=n_erode_dilate_mask_for_box)
+        input_boxes = utils.binary_mask_to_box(mask_binary, w_scale=mask_size_scale[0], h_scale=mask_size_scale[1])
+        input_boxes = [input_boxes]
+        masks, conf_scores = sam_box_input(model_dict, image=sam_input_image, input_boxes=input_boxes, target_mask_shape=(H, W))
+    else:
+        # point input
+        mask_binary = preprocess_mask(token_attn_np_smooth, mask_th_for_point, n_erode_dilate_mask=0)
+        # Uses the max coordinate only
+        max_coord = np.unravel_index(token_attn_np_smooth.argmax(), token_attn_np_smooth.shape)
+        # print("max_coord:", max_coord)
+        input_points = [[[max_coord[1] * mask_size_scale[1], max_coord[0] * mask_size_scale[0]]]]
+        masks, conf_scores = sam_point_input(model_dict, image=sam_input_image, input_points=input_points, target_mask_shape=(H, W))
+    if verbose:
+        plt.title("Coarse binary mask (for box for box input and for iou)")
+        plt.imshow(mask_binary)
+        plt.show()
+    coarse_ious = get_iou_with_resize(mask_binary, masks, masks_shape=mask_binary.shape)
+    mask_selected, conf_score_selected = select_mask(masks, conf_scores, coarse_ious=coarse_ious,
+                                                         rule="largest_over_conf",
+                                                         discourage_mask_below_confidence=discourage_mask_below_confidence,
+                                                         discourage_mask_below_coarse_iou=discourage_mask_below_coarse_iou,
+                                                         verbose=True)
+    return mask_selected, conf_score_selected
+def sam_refine_box(sam_input_image, box, model_dict, height, width, H, W, discourage_mask_below_confidence, discourage_mask_below_coarse_iou, verbose):
+    # (w, h)
+    input_boxes = utils.scale_proportion(box, H=height, W=width)
+    input_boxes = [input_boxes]
+    masks, conf_scores = sam_box_input(model_dict, image=sam_input_image, input_boxes=input_boxes, target_mask_shape=(H, W))
+    mask_binary = utils.proportion_to_mask(box, H, W, return_np=True)
+    if verbose:
+        # Also the box is the input for SAM
+        plt.title("Binary mask from input box (for iou)")
+        plt.imshow(mask_binary)
+        plt.show()
+    coarse_ious = get_iou_with_resize(mask_binary, masks, masks_shape=mask_binary.shape)
+    mask_selected, conf_score_selected = select_mask(masks, conf_scores, coarse_ious=coarse_ious,
+                                                         rule="largest_over_conf",
+                                                         discourage_mask_below_confidence=discourage_mask_below_confidence,
+                                                         discourage_mask_below_coarse_iou=discourage_mask_below_coarse_iou,
+                                                         verbose=True)
+    return mask_selected, conf_score_selected

models/transformer_2d.py ADDED Viewed

	@@ -0,0 +1,367 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import ImagePositionalEmbeddings
+from diffusers.utils import BaseOutput, deprecate
+from .attention import BasicTransformerBlock
+from diffusers.models.embeddings import PatchEmbed
+from diffusers.models.modeling_utils import ModelMixin
+@dataclass
+class Transformer2DModelOutput(BaseOutput):
+    """
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
+            Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions
+            for the unnoised latent pixels.
+    """
+    sample: torch.FloatTensor
+class Transformer2DModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual
+    embeddings) inputs.
+    When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
+    transformer action. Finally, reshape to image.
+    When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
+    embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
+    classes of unnoised image.
+    Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
+    image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.
+    Parameters:
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input and output.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to but not more than steps than `num_embeds_ada_norm`.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        out_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        sample_size: Optional[int] = None,
+        num_vector_embeds: Optional[int] = None,
+        patch_size: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+        norm_type: str = "layer_norm",
+        norm_elementwise_affine: bool = True,
+        use_gated_attention: bool = False,
+    ):
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+        # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
+        # Define whether input is continuous or discrete depending on configuration
+        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
+        self.is_input_vectorized = num_vector_embeds is not None
+        self.is_input_patches = in_channels is not None and patch_size is not None
+        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
+            deprecation_message = (
+                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
+                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
+                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
+                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
+            )
+            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
+            norm_type = "ada_norm"
+        if self.is_input_continuous and self.is_input_vectorized:
+            raise ValueError(
+                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
+                " sure that either `in_channels` or `num_vector_embeds` is None."
+            )
+        elif self.is_input_vectorized and self.is_input_patches:
+            raise ValueError(
+                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
+                " sure that either `num_vector_embeds` or `num_patches` is None."
+            )
+        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
+            raise ValueError(
+                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
+                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
+            )
+        # 2. Define input layers
+        if self.is_input_continuous:
+            self.in_channels = in_channels
+            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+            if use_linear_projection:
+                self.proj_in = nn.Linear(in_channels, inner_dim)
+            else:
+                self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
+            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
+            self.height = sample_size
+            self.width = sample_size
+            self.num_vector_embeds = num_vector_embeds
+            self.num_latent_pixels = self.height * self.width
+            self.latent_image_embedding = ImagePositionalEmbeddings(
+                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
+            )
+        elif self.is_input_patches:
+            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
+            self.height = sample_size
+            self.width = sample_size
+            self.patch_size = patch_size
+            self.pos_embed = PatchEmbed(
+                height=sample_size,
+                width=sample_size,
+                patch_size=patch_size,
+                in_channels=in_channels,
+                embed_dim=inner_dim,
+            )
+        # 3. Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                    norm_type=norm_type,
+                    norm_elementwise_affine=norm_elementwise_affine,
+                    use_gated_attention=use_gated_attention,
+                )
+                for d in range(num_layers)
+            ]
+        )
+        # 4. Define output layers
+        self.out_channels = in_channels if out_channels is None else out_channels
+        if self.is_input_continuous:
+            # TODO: should use out_channels for continuous projections
+            if use_linear_projection:
+                self.proj_out = nn.Linear(inner_dim, in_channels)
+            else:
+                self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+        elif self.is_input_vectorized:
+            self.norm_out = nn.LayerNorm(inner_dim)
+            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
+        elif self.is_input_patches:
+            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
+            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
+            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        return_cross_attention_probs: bool = False,
+    ):
+        """
+        Args:
+            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
+                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
+                hidden_states
+            encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            timestep ( `torch.LongTensor`, *optional*):
+                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
+                conditioning.
+            encoder_attention_mask ( `torch.Tensor`, *optional* ).
+                Cross-attention mask, applied to encoder_hidden_states. Two formats supported:
+                    Mask `(batch, sequence_length)` True = keep, False = discard. Bias `(batch, 1, sequence_length)` 0
+                    = keep, -10000 = discard.
+                If ndim == 2: will be interpreted as a mask, then converted into a bias consistent with the format
+                above. This bias will be added to the cross-attention scores.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
+            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None and attention_mask.ndim == 2:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # 1. Input
+        if self.is_input_continuous:
+            batch, _, height, width = hidden_states.shape
+            residual = hidden_states
+            hidden_states = self.norm(hidden_states)
+            if not self.use_linear_projection:
+                hidden_states = self.proj_in(hidden_states)
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            else:
+                inner_dim = hidden_states.shape[1]
+                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+                hidden_states = self.proj_in(hidden_states)
+        elif self.is_input_vectorized:
+            hidden_states = self.latent_image_embedding(hidden_states)
+        elif self.is_input_patches:
+            hidden_states = self.pos_embed(hidden_states)
+        base_attn_key = cross_attention_kwargs["attn_key"]
+        # 2. Blocks
+        cross_attention_probs_all = []
+        for block_ind, block in enumerate(self.transformer_blocks):
+            cross_attention_kwargs["attn_key"] = base_attn_key + [block_ind]
+            hidden_states = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                timestep=timestep,
+                cross_attention_kwargs=cross_attention_kwargs,
+                class_labels=class_labels,
+                return_cross_attention_probs=return_cross_attention_probs,
+            )
+            if return_cross_attention_probs:
+                hidden_states, cross_attention_probs = hidden_states
+                cross_attention_probs_all.append(cross_attention_probs)
+        # 3. Output
+        if self.is_input_continuous:
+            if not self.use_linear_projection:
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+                hidden_states = self.proj_out(hidden_states)
+            else:
+                hidden_states = self.proj_out(hidden_states)
+                hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            output = hidden_states + residual
+        elif self.is_input_vectorized:
+            hidden_states = self.norm_out(hidden_states)
+            logits = self.out(hidden_states)
+            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
+            logits = logits.permute(0, 2, 1)
+            # log(p(x_0))
+            output = F.log_softmax(logits.double(), dim=1).float()
+        elif self.is_input_patches:
+            # TODO: cleanup!
+            conditioning = self.transformer_blocks[0].norm1.emb(
+                timestep, class_labels, hidden_dtype=hidden_states.dtype
+            )
+            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
+            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+            hidden_states = self.proj_out_2(hidden_states)
+            # unpatchify
+            height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+        if len(cross_attention_probs_all) == 1:
+            # If we only have one transformer block in a Transformer2DModel, we do not create another nested level.
+            cross_attention_probs_all = cross_attention_probs_all[0]
+        if not return_dict:
+            if return_cross_attention_probs:
+                return (output, cross_attention_probs_all)
+            return (output,)
+        output = Transformer2DModelOutput(sample=output)
+        if return_cross_attention_probs:
+            return output, cross_attention_probs_all
+        return output

models/unet_2d_blocks.py ADDED Viewed

	@@ -0,0 +1,793 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Dict, Optional, Tuple
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from diffusers.utils import is_torch_version
+from diffusers.models.dual_transformer_2d import DualTransformer2DModel
+from diffusers.models.resnet import Downsample2D, ResnetBlock2D, Upsample2D
+from .transformer_2d import Transformer2DModel
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    resnet_skip_time_act=False,
+    resnet_out_scale_factor=1.0,
+    cross_attention_norm=None,
+    use_gated_attention=False,
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith(
+        "UNetRes") else down_block_type
+    if down_block_type == "DownBlock2D":
+        return DownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "CrossAttnDownBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnDownBlock2D")
+        return CrossAttnDownBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_gated_attention=use_gated_attention,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+    resnet_skip_time_act=False,
+    resnet_out_scale_factor=1.0,
+    cross_attention_norm=None,
+    use_gated_attention=False,
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith(
+        "UNetRes") else up_block_type
+    if up_block_type == "UpBlock2D":
+        return UpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "CrossAttnUpBlock2D":
+        if cross_attention_dim is None:
+            raise ValueError(
+                "cross_attention_dim must be specified for CrossAttnUpBlock2D")
+        return CrossAttnUpBlock2D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            use_gated_attention=use_gated_attention,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+class UNetMidBlock2DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        upcast_attention=False,
+        use_gated_attention=False,
+    ):
+        super().__init__()
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = resnet_groups if resnet_groups is not None else min(
+            in_channels // 4, 32)
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+        for _ in range(num_layers):
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        upcast_attention=upcast_attention,
+                        use_gated_attention=use_gated_attention,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        in_channels // attn_num_head_channels,
+                        in_channels=in_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        return_cross_attention_probs: bool = False,
+    ) -> torch.FloatTensor:
+        hidden_states = self.resnets[0](hidden_states, temb)
+        cross_attention_probs_all = []
+        base_attn_key = cross_attention_kwargs["attn_key"]
+        for attn_key, (attn, resnet) in enumerate(zip(self.attentions, self.resnets[1:])):
+            cross_attention_kwargs["attn_key"] = base_attn_key + [attn_key]
+            hidden_states = attn(
+                hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                cross_attention_kwargs=cross_attention_kwargs,
+                attention_mask=attention_mask,
+                encoder_attention_mask=encoder_attention_mask,
+                return_dict=False,
+                return_cross_attention_probs=return_cross_attention_probs,
+            )
+            if return_cross_attention_probs:
+                hidden_states, cross_attention_probs = hidden_states
+                cross_attention_probs_all.append(cross_attention_probs)
+            else:
+                hidden_states = hidden_states[0]
+            hidden_states = resnet(hidden_states, temb)
+        if return_cross_attention_probs:
+            return hidden_states, cross_attention_probs_all
+        return hidden_states
+class CrossAttnDownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        use_gated_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        use_gated_attention=use_gated_attention
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        return_cross_attention_probs: bool = False,
+    ):
+        output_states = ()
+        cross_attention_probs_all = []
+        base_attn_key = cross_attention_kwargs["attn_key"]
+        for attn_key, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+            cross_attention_kwargs["attn_key"] = base_attn_key + [attn_key]
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                ckpt_kwargs: Dict[str, Any] = {
+                    "use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    None,  # timestep
+                    None,  # class_labels
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                    return_cross_attention_probs=return_cross_attention_probs,
+                    **ckpt_kwargs,
+                )
+                if return_cross_attention_probs:
+                    hidden_states, cross_attention_probs = hidden_states
+                    cross_attention_probs_all.append(cross_attention_probs)
+                else:
+                    hidden_states = hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    return_cross_attention_probs=return_cross_attention_probs,
+                )
+                if return_cross_attention_probs:
+                    hidden_states, cross_attention_probs = hidden_states
+                    cross_attention_probs_all.append(cross_attention_probs)
+                else:
+                    hidden_states = hidden_states[0]
+            output_states = output_states + (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states = output_states + (hidden_states,)
+        if return_cross_attention_probs:
+            return hidden_states, output_states, cross_attention_probs_all
+        return hidden_states, output_states
+class DownBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample2D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+            output_states = output_states + (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states = output_states + (hidden_states,)
+        return hidden_states, output_states
+class CrossAttnUpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+        use_gated_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if not dual_cross_attention:
+                attentions.append(
+                    Transformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                        use_linear_projection=use_linear_projection,
+                        only_cross_attention=only_cross_attention,
+                        upcast_attention=upcast_attention,
+                        use_gated_attention=use_gated_attention,
+                    )
+                )
+            else:
+                attentions.append(
+                    DualTransformer2DModel(
+                        attn_num_head_channels,
+                        out_channels // attn_num_head_channels,
+                        in_channels=out_channels,
+                        num_layers=1,
+                        cross_attention_dim=cross_attention_dim,
+                        norm_num_groups=resnet_groups,
+                    )
+                )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
+        temb: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        upsample_size: Optional[int] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        return_cross_attention_probs: bool = False,
+    ):
+        cross_attention_probs_all = []
+        base_attn_key = cross_attention_kwargs["attn_key"]
+        for attn_key, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)):
+            cross_attention_kwargs["attn_key"] = base_attn_key + [attn_key]
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                ckpt_kwargs: Dict[str, Any] = {
+                    "use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                    None,  # timestep
+                    None,  # class_labels
+                    cross_attention_kwargs,
+                    attention_mask,
+                    encoder_attention_mask,
+                    **ckpt_kwargs,
+                )
+                if return_cross_attention_probs:
+                    hidden_states, cross_attention_probs = hidden_states
+                    cross_attention_probs_all.append(cross_attention_probs)
+                else:
+                    hidden_states = hidden_states[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(
+                    hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_dict=False,
+                    return_cross_attention_probs=return_cross_attention_probs,
+                )
+                if return_cross_attention_probs:
+                    hidden_states, cross_attention_probs = hidden_states
+                    cross_attention_probs_all.append(cross_attention_probs)
+                else:
+                    hidden_states = hidden_states[0]
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        if return_cross_attention_probs:
+            return hidden_states, cross_attention_probs_all
+        return hidden_states
+class UpBlock2D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (
+                i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock2D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList(
+                [Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat(
+                [hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                if is_torch_version(">=", "1.11.0"):
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
+                    )
+                else:
+                    hidden_states = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(resnet), hidden_states, temb
+                    )
+            else:
+                hidden_states = resnet(hidden_states, temb)
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states

models/unet_2d_condition.py ADDED Viewed

	@@ -0,0 +1,980 @@

+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import UNet2DConditionLoadersMixin
+from diffusers.utils import BaseOutput, logging
+from diffusers.models.embeddings import (
+    GaussianFourierProjection,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
+from diffusers.models.modeling_utils import ModelMixin
+from .unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    CrossAttnUpBlock2D,
+    DownBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UpBlock2D,
+    get_down_block,
+    get_up_block,
+)
+from .attention_processor import AttentionProcessor, AttnProcessor
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet2DConditionOutput(BaseOutput):
+    """
+    Args:
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model.
+    """
+    sample: torch.FloatTensor
+    cross_attention_probs_down: List[Any]
+    cross_attention_probs_mid: List[Any]
+    cross_attention_probs_up: List[Any]
+class FourierEmbedder(nn.Module):
+    def __init__(self, num_freqs=64, temperature=100):
+        super().__init__()
+        self.num_freqs = num_freqs
+        self.temperature = temperature
+        freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs)
+        freq_bands = freq_bands[None, None, None]
+        self.register_buffer('freq_bands', freq_bands, persistent=False)
+    def __call__(self, x):
+        x = self.freq_bands * x.unsqueeze(-1)
+        return torch.stack((x.sin(), x.cos()), dim=-1).permute(0, 1, 3, 4, 2).reshape(*x.shape[:2], -1)
+class PositionNet(nn.Module):
+    def __init__(self,  positive_len, out_dim, fourier_freqs=8):
+        super().__init__()
+        self.positive_len = positive_len
+        self.out_dim = out_dim
+        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
+        self.position_dim = fourier_freqs * 2 * 4 # 2: sin/cos, 4: xyxy
+        self.linears = nn.Sequential(
+            nn.Linear(self.positive_len + self.position_dim, 512),
+            nn.SiLU(),
+            nn.Linear(512, 512),
+            nn.SiLU(),
+            nn.Linear(512, out_dim),
+        )
+        self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
+        self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim]))
+    def forward(self, boxes, masks, positive_embeddings):
+        masks = masks.unsqueeze(-1)
+        # embedding position (it may includes padding as placeholder)
+        xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 -> B*N*C
+        # learnable null embedding
+        positive_null = self.null_positive_feature.view(1, 1, -1)
+        xyxy_null = self.null_position_feature.view(1, 1, -1)
+        # replace padding with learnable null embedding
+        positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
+        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+        objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
+        return objs
+class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+    r"""
+    UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep
+    and returns sample shaped output.
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the models (such as downloading or saving, etc.)
+    Parameters:
+        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
+            Height and width of input/output sample.
+        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `False`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use.
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            The mid block type. Choose from `UNetMidBlock2DCrossAttn` or `UNetMidBlock2DSimpleCrossAttn`, will skip the
+            mid block layer if `None`.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use.
+        only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, it will skip the normalization and activation layers in post-processing
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
+        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to None):
+            If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to None):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to None):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        num_class_embeds (`int`, *optional*, defaults to None):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, default to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, default to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, default to `None`):
+            Optional activation function to use on the time embeddings only one time before they as passed to the rest
+            of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str, *optional*, default to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, default to `None`):
+            The dimension of `cond_proj` layer in timestep embedding.
+        conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+            using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the
+            `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will
+            default to `False`.
+    """
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: Union[int, Tuple[int]] = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: Union[int, Tuple[int]] = 1280,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        resnet_skip_time_act: bool = False,
+        resnet_out_scale_factor: int = 1.0,
+        time_embedding_type: str = "positional",
+        time_embedding_dim: Optional[int] = None,
+        time_embedding_act_fn: Optional[str] = None,
+        timestep_post_act: Optional[str] = None,
+        time_cond_proj_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        class_embeddings_concat: bool = False,
+        mid_block_only_cross_attention: Optional[bool] = None,
+        cross_attention_norm: Optional[str] = None,
+        addition_embed_type_num_heads=64,
+        use_gated_attention: bool = False,
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        # Check inputs
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
+        self.conv_in = nn.Conv2d(
+            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
+        )
+        # time
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+            post_act_fn=timestep_post_act,
+            cond_proj_dim=time_cond_proj_dim,
+        )
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
+        if time_embedding_act_fn is None:
+            self.time_embed_act = None
+        elif time_embedding_act_fn == "swish":
+            self.time_embed_act = lambda x: F.silu(x)
+        elif time_embedding_act_fn == "mish":
+            self.time_embed_act = nn.Mish()
+        elif time_embedding_act_fn == "silu":
+            self.time_embed_act = nn.SiLU()
+        elif time_embedding_act_fn == "gelu":
+            self.time_embed_act = nn.GELU()
+        else:
+            raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}")
+        self.down_blocks = nn.ModuleList([])
+        self.up_blocks = nn.ModuleList([])
+        if isinstance(only_cross_attention, bool):
+            if mid_block_only_cross_attention is None:
+                mid_block_only_cross_attention = only_cross_attention
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if mid_block_only_cross_attention is None:
+            mid_block_only_cross_attention = False
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        if isinstance(cross_attention_dim, int):
+            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
+        else:
+            assert not use_gated_attention, f"use_gated_attention is not supported with varying cross_attention_dim: {cross_attention_dim}"
+        if isinstance(layers_per_block, int):
+            layers_per_block = [layers_per_block] * len(down_block_types)
+        if class_embeddings_concat:
+            # The time embeddings are concatenated with the class embeddings. The dimension of the
+            # time embeddings passed to the down, middle, and up blocks is twice the dimension of the
+            # regular time embeddings
+            blocks_time_embed_dim = time_embed_dim * 2
+        else:
+            blocks_time_embed_dim = time_embed_dim
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block[i],
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim[i],
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                use_gated_attention=use_gated_attention,
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                attn_num_head_channels=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                use_gated_attention=use_gated_attention,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        # count how many layers upsample the images
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        reversed_layers_per_block = list(reversed(layers_per_block))
+        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=reversed_layers_per_block[i] + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=blocks_time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=reversed_cross_attention_dim[i],
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                resnet_skip_time_act=resnet_skip_time_act,
+                resnet_out_scale_factor=resnet_out_scale_factor,
+                cross_attention_norm=cross_attention_norm,
+                use_gated_attention=use_gated_attention,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        # out
+        if norm_num_groups is not None:
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+            )
+            if act_fn == "swish":
+                self.conv_act = lambda x: F.silu(x)
+            elif act_fn == "mish":
+                self.conv_act = nn.Mish()
+            elif act_fn == "silu":
+                self.conv_act = nn.SiLU()
+            elif act_fn == "gelu":
+                self.conv_act = nn.GELU()
+            else:
+                raise ValueError(f"Unsupported activation function: {act_fn}")
+        else:
+            self.conv_norm_out = None
+            self.conv_act = None
+        conv_out_padding = (conv_out_kernel - 1) // 2
+        self.conv_out = nn.Conv2d(
+            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
+        )
+        if use_gated_attention:
+            self.position_net = PositionNet(positive_len=768, out_dim=cross_attention_dim[-1])
+    @property
+    def attn_processors(self) -> Dict[str, AttentionProcessor]:
+        r"""
+        Returns:
+            `dict` of attention processors: A dictionary containing all attention processors used in the model with
+            indexed by its weight name.
+        """
+        # set recursively
+        processors = {}
+        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+            if hasattr(module, "set_processor"):
+                processors[f"{name}.processor"] = module.processor
+            for sub_name, child in module.named_children():
+                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
+            return processors
+        for name, module in self.named_children():
+            fn_recursive_add_processors(name, module, processors)
+        return processors
+    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+        r"""
+        Parameters:
+            `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`):
+                The instantiated processor class or a dictionary of processor classes that will be set as the processor
+                of **all** `Attention` layers.
+            In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.:
+        """
+        count = len(self.attn_processors.keys())
+        if isinstance(processor, dict) and len(processor) != count:
+            raise ValueError(
+                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
+                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
+            )
+        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
+            if hasattr(module, "set_processor"):
+                if not isinstance(processor, dict):
+                    module.set_processor(processor)
+                else:
+                    module.set_processor(processor.pop(f"{name}.processor"))
+            for sub_name, child in module.named_children():
+                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
+        for name, module in self.named_children():
+            fn_recursive_attn_processor(name, module, processor)
+    def set_default_attn_processor(self):
+        """
+        Disables custom attention processors and sets the default attention implementation.
+        """
+        self.set_attn_processor(AttnProcessor())
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+        num_sliceable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
+        down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
+        mid_block_additional_residual: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        return_cross_attention_probs: bool = False
+    ) -> Union[UNet2DConditionOutput, Tuple]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            encoder_attention_mask (`torch.Tensor`):
+                (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False =
+                discard. Mask will be converted into a bias, which adds large negative values to attention scores
+                corresponding to "discard" tokens.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            added_cond_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time
+                embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and
+                `addition_embed_type` for more information.
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+            if self.config.class_embeddings_concat:
+                emb = torch.cat([emb, class_emb], dim=-1)
+            else:
+                emb = emb + class_emb
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+            emb = emb + aug_emb
+        elif self.config.addition_embed_type == "text_image":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+            emb = emb + aug_emb
+        if self.time_embed_act is not None:
+            emb = self.time_embed_act(emb)
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kadinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        # 2. pre-process
+        sample = self.conv_in(sample)
+        # 2.5 GLIGEN position net
+        if cross_attention_kwargs is not None and cross_attention_kwargs.get('gligen', None) is not None:
+            cross_attention_kwargs = cross_attention_kwargs.copy()
+            cross_attention_kwargs['gligen'] = {
+                'objs': self.position_net(
+                    boxes=cross_attention_kwargs['gligen']['boxes'],
+                    masks=cross_attention_kwargs['gligen']['masks'],
+                    positive_embeddings=cross_attention_kwargs['gligen']['positive_embeddings']
+                ),
+                'fuser_attn_kwargs': cross_attention_kwargs['gligen'].get('fuser_attn_kwargs', {})
+            }
+        # 3. down
+        down_block_res_samples = (sample,)
+        cross_attention_probs_down = []
+        if cross_attention_kwargs is None:
+            cross_attention_kwargs = {}
+        for i, downsample_block in enumerate(self.down_blocks):
+            cross_attention_kwargs["attn_key"] = ["down", i]
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                downsample_block_output = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_cross_attention_probs=return_cross_attention_probs,
+                )
+                if return_cross_attention_probs:
+                    sample, res_samples, cross_attention_probs = downsample_block_output
+                    cross_attention_probs_down.append(cross_attention_probs)
+                else:
+                    sample, res_samples = downsample_block_output
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+            down_block_res_samples += res_samples
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample = down_block_res_sample + down_block_additional_residual
+                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # 4. mid
+        cross_attention_probs_mid = []
+        if self.mid_block is not None:
+            cross_attention_kwargs["attn_key"] = ["mid", 0]
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                cross_attention_kwargs=cross_attention_kwargs,
+                encoder_attention_mask=encoder_attention_mask,
+                return_cross_attention_probs=return_cross_attention_probs,
+            )
+            if return_cross_attention_probs:
+                sample, cross_attention_probs = sample
+                cross_attention_probs_mid.append(cross_attention_probs)
+        if mid_block_additional_residual is not None:
+            sample = sample + mid_block_additional_residual
+        cross_attention_probs_up = []
+        # 5. up
+        for i, upsample_block in enumerate(self.up_blocks):
+            cross_attention_kwargs["attn_key"] = ["up", i]
+            is_final_block = i == len(self.up_blocks) - 1
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    return_cross_attention_probs=return_cross_attention_probs,
+                )
+                if return_cross_attention_probs:
+                    sample, cross_attention_probs = sample
+                    cross_attention_probs_up.append(cross_attention_probs)
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+        # 6. post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if not return_dict:
+            return (sample,)
+        return UNet2DConditionOutput(sample=sample, cross_attention_probs_down=cross_attention_probs_down, cross_attention_probs_mid=cross_attention_probs_mid, cross_attention_probs_up=cross_attention_probs_up)

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+--extra-index-url https://download.pytorch.org/whl/cu113
+numpy
+scipy
+torch==2.0.0
+diffusers==0.17.0
+transformers==4.29.2
+opencv-python==4.7.0.72
+opencv-contrib-python==4.7.0.72
+inflect==6.0.4
+easydict
+accelerate==0.18.0

shared.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from models import load_sd, sam
+use_fp16 = False
+use_dpm = True
+sd_key = "gligen/diffusers-generation-text-box"
+print(f"Using SD: {sd_key}")
+model_dict = load_sd(key=sd_key, use_fp16=use_fp16, use_dpm_multistep_scheduler=use_dpm, load_inverse_scheduler=False)
+sam_model_dict = sam.load_sam()

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .utils import *

utils/latents.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import torch
+import numpy as np
+from . import utils
+from utils import torch_device
+import matplotlib.pyplot as plt
+def get_unscaled_latents(batch_size, in_channels, height, width, generator, dtype):
+    """
+    in_channels: often obtained with `unet.config.in_channels`
+    """
+    # Obtain with torch.float32 and cast to float16 if needed
+    # Directly obtaining latents in float16 will lead to different latents
+    latents_base = torch.randn(
+        (batch_size, in_channels, height // 8, width // 8),
+        generator=generator, dtype=dtype
+    ).to(torch_device, dtype=dtype)
+    return latents_base
+def get_scaled_latents(batch_size, in_channels, height, width, generator, dtype, scheduler):
+    latents_base = get_unscaled_latents(batch_size, in_channels, height, width, generator, dtype)
+    latents_base = latents_base * scheduler.init_noise_sigma
+    return latents_base
+def blend_latents(latents_bg, latents_fg, fg_mask, fg_blending_ratio=0.01):
+    """
+    in_channels: often obtained with `unet.config.in_channels`
+    """
+    assert not torch.allclose(latents_bg, latents_fg), "latents_bg should be independent with latents_fg"
+    dtype = latents_bg.dtype
+    latents = latents_bg * (1. - fg_mask) + (latents_bg * np.sqrt(1. - fg_blending_ratio) + latents_fg * np.sqrt(fg_blending_ratio)) * fg_mask
+    latents = latents.to(dtype=dtype)
+    return latents
+@torch.no_grad()
+def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inference_steps, overall_batch_size, height, width, latents_bg=None, bg_seed=None, compose_box_to_bg=True):
+    unet, scheduler, dtype = model_dict.unet, model_dict.scheduler, model_dict.dtype
+    if latents_bg is None:
+        generator = torch.manual_seed(bg_seed)  # Seed generator to create the inital latent noise
+        latents_bg = get_scaled_latents(overall_batch_size, unet.config.in_channels, height, width, generator, dtype, scheduler)
+    # Other than t=T (idx=0), we only have masked latents. This is to prevent accidentally loading from non-masked part. Use same mask as the one used to compose the latents.
+    composed_latents = torch.zeros((num_inference_steps + 1, *latents_bg.shape), dtype=dtype)
+    composed_latents[0] = latents_bg
+    foreground_indices = torch.zeros(latents_bg.shape[-2:], dtype=torch.long)
+    mask_size = np.array([mask_tensor.sum().item() for mask_tensor in mask_tensor_list])
+    # Compose the largest mask first
+    mask_order = np.argsort(-mask_size)
+    if compose_box_to_bg:
+        # This has two functionalities:
+        # 1. copies the right initial latents from the right place (for centered so generation), 2. copies the right initial latents (since we have foreground blending) for centered/original so generation.
+        for mask_idx in mask_order:
+            latents_all, mask_tensor = latents_all_list[mask_idx], mask_tensor_list[mask_idx]
+            # Note: need to be careful to not copy from zeros due to shifting.
+            mask_tensor = utils.binary_mask_to_box_mask(mask_tensor, to_device=False)
+            mask_tensor_expanded = mask_tensor[None, None, None, ...].to(dtype)
+            composed_latents[0] = composed_latents[0] * (1. - mask_tensor_expanded) + latents_all[0] * mask_tensor_expanded
+    # This is still needed with `compose_box_to_bg` to ensure the foreground latent is still visible and to compute foreground indices.
+    for mask_idx in mask_order:
+        latents_all, mask_tensor = latents_all_list[mask_idx], mask_tensor_list[mask_idx]
+        foreground_indices = foreground_indices * (~mask_tensor) + (mask_idx + 1) * mask_tensor
+        mask_tensor_expanded = mask_tensor[None, None, None, ...].to(dtype)
+        composed_latents = composed_latents * (1. - mask_tensor_expanded) + latents_all * mask_tensor_expanded
+    composed_latents, foreground_indices = composed_latents.to(torch_device), foreground_indices.to(torch_device)
+    return composed_latents, foreground_indices
+def align_with_bboxes(latents_all_list, mask_tensor_list, bboxes, horizontal_shift_only=False):
+    """
+    Each offset in `offset_list` is `(x_offset, y_offset)` (normalized).
+    """
+    new_latents_all_list, new_mask_tensor_list, offset_list = [], [], []
+    for latents_all, mask_tensor, bbox in zip(latents_all_list, mask_tensor_list, bboxes):
+        x_src_center, y_src_center = utils.binary_mask_to_center(mask_tensor, normalize=True)
+        x_min_dest, y_min_dest, x_max_dest, y_max_dest = bbox
+        x_dest_center, y_dest_center = (x_min_dest + x_max_dest) / 2, (y_min_dest + y_max_dest) / 2
+        # print("src (x,y):", x_src_center, y_src_center, "dest (x,y):", x_dest_center, y_dest_center)
+        x_offset, y_offset = x_dest_center - x_src_center, y_dest_center - y_src_center
+        if horizontal_shift_only:
+            y_offset = 0.
+        offset = x_offset, y_offset
+        latents_all = utils.shift_tensor(latents_all, x_offset, y_offset, offset_normalized=True)
+        mask_tensor = utils.shift_tensor(mask_tensor, x_offset, y_offset, offset_normalized=True)
+        new_latents_all_list.append(latents_all)
+        new_mask_tensor_list.append(mask_tensor)
+        offset_list.append(offset)
+    return new_latents_all_list, new_mask_tensor_list, offset_list
+@torch.no_grad()
+def compose_latents_with_alignment(
+    model_dict, latents_all_list, mask_tensor_list, num_inference_steps, overall_batch_size, height, width,
+    align_with_overall_bboxes=True, overall_bboxes=None, horizontal_shift_only=False, **kwargs
+):
+    if align_with_overall_bboxes and len(latents_all_list):
+        expanded_overall_bboxes = utils.expand_overall_bboxes(overall_bboxes)
+        latents_all_list, mask_tensor_list, offset_list = align_with_bboxes(latents_all_list, mask_tensor_list, bboxes=expanded_overall_bboxes, horizontal_shift_only=horizontal_shift_only)
+    else:
+        offset_list = [(0., 0.) for _ in range(len(latents_all_list))]
+    composed_latents, foreground_indices = compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inference_steps, overall_batch_size, height, width, **kwargs)
+    return composed_latents, foreground_indices, offset_list
+def get_input_latents_list(model_dict, bg_seed, fg_seed_start, fg_blending_ratio, height, width, so_prompt_phrase_box_list=None, so_boxes=None, verbose=False):
+    """
+    Note: the returned input latents are scaled by `scheduler.init_noise_sigma`
+    """
+    unet, scheduler, dtype = model_dict.unet, model_dict.scheduler, model_dict.dtype
+    generator_bg = torch.manual_seed(bg_seed)  # Seed generator to create the inital latent noise
+    latents_bg = get_unscaled_latents(batch_size=1, in_channels=unet.config.in_channels, height=height, width=width, generator=generator_bg, dtype=dtype)
+    input_latents_list = []
+    if so_boxes is None:
+        # For compatibility
+        so_boxes = [item[-1] for item in so_prompt_phrase_box_list]
+    # change this changes the foreground initial noise
+    for idx, obj_box in enumerate(so_boxes):
+        H, W = height // 8, width // 8
+        fg_mask = utils.proportion_to_mask(obj_box, H, W)
+        if verbose:
+            plt.imshow(fg_mask.cpu().numpy())
+            plt.show()
+        fg_seed = fg_seed_start + idx
+        if fg_seed == bg_seed:
+            # We should have different seeds for foreground and background
+            fg_seed += 12345
+        generator_fg = torch.manual_seed(fg_seed)
+        latents_fg = get_unscaled_latents(batch_size=1, in_channels=unet.config.in_channels, height=height, width=width, generator=generator_fg, dtype=dtype)
+        input_latents = blend_latents(latents_bg, latents_fg, fg_mask, fg_blending_ratio=fg_blending_ratio)
+        input_latents = input_latents * scheduler.init_noise_sigma
+        input_latents_list.append(input_latents)
+    return input_latents_list, latents_bg

utils/parse.py ADDED Viewed

	@@ -0,0 +1,284 @@

+import ast
+import os
+import json
+from matplotlib.patches import Polygon
+from matplotlib.collections import PatchCollection
+import matplotlib.pyplot as plt
+import numpy as np
+import cv2
+import inflect
+p = inflect.engine()
+img_dir = "imgs"
+bg_prompt_text = "Background prompt: "
+# h, w
+box_scale = (512, 512)
+size = box_scale
+size_h, size_w = size
+print(f"Using box scale: {box_scale}")
+def parse_input(text=None, no_input=False):
+    if not text:
+        if no_input:
+            return
+        text = input("Enter the response: ")
+    if "Objects: " in text:
+        text = text.split("Objects: ")[1]
+    text_split = text.split(bg_prompt_text)
+    if len(text_split) == 2:
+        gen_boxes, bg_prompt = text_split
+    elif len(text_split) == 1:
+        if no_input:
+            return
+        gen_boxes = text
+        bg_prompt = ""
+        while not bg_prompt:
+            # Ignore the empty lines in the response
+            bg_prompt = input("Enter the background prompt: ").strip()
+        if bg_prompt_text in bg_prompt:
+            bg_prompt = bg_prompt.split(bg_prompt_text)[1]
+    else:
+        raise ValueError(f"text: {text}")
+    try:
+        gen_boxes = ast.literal_eval(gen_boxes)
+    except SyntaxError as e:
+        # Sometimes the response is in plain text
+        if "No objects" in gen_boxes:
+            gen_boxes = []
+        else:
+            raise e
+    bg_prompt = bg_prompt.strip()
+    return gen_boxes, bg_prompt
+def filter_boxes(gen_boxes, scale_boxes=True, ignore_background=True, max_scale=3):
+    if len(gen_boxes) == 0:
+        return []
+    box_dict_format = False
+    gen_boxes_new = []
+    for gen_box in gen_boxes:
+        if isinstance(gen_box, dict):
+            name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box['name'], gen_box['bounding_box']
+            box_dict_format = True
+        else:
+            name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box
+        if bbox_w <= 0 or bbox_h <= 0:
+            # Empty boxes
+            continue
+        if ignore_background:
+            if (bbox_w >= size[1] and bbox_h >= size[0]) or bbox_x > size[1] or bbox_y > size[0]:
+                # Ignore the background boxes
+                continue
+        gen_boxes_new.append(gen_box)
+    gen_boxes = gen_boxes_new
+    if len(gen_boxes) == 0:
+        return []
+    filtered_gen_boxes = []
+    if box_dict_format:
+        # For compatibility
+        bbox_left_x_min = min([gen_box['bounding_box'][0] for gen_box in gen_boxes])
+        bbox_right_x_max = max([gen_box['bounding_box'][0] + gen_box['bounding_box'][2] for gen_box in gen_boxes])
+        bbox_top_y_min = min([gen_box['bounding_box'][1] for gen_box in gen_boxes])
+        bbox_bottom_y_max = max([gen_box['bounding_box'][1] + gen_box['bounding_box'][3] for gen_box in gen_boxes])
+    else:
+        bbox_left_x_min = min([gen_box[1][0] for gen_box in gen_boxes])
+        bbox_right_x_max = max([gen_box[1][0] + gen_box[1][2] for gen_box in gen_boxes])
+        bbox_top_y_min = min([gen_box[1][1] for gen_box in gen_boxes])
+        bbox_bottom_y_max = max([gen_box[1][1] + gen_box[1][3] for gen_box in gen_boxes])
+    # All boxes are empty
+    if (bbox_right_x_max - bbox_left_x_min) == 0:
+        return []
+    # Used if scale_boxes is True
+    shift = -bbox_left_x_min
+    scale = size_w / (bbox_right_x_max - bbox_left_x_min)
+    scale = min(scale, max_scale)
+    for gen_box in gen_boxes:
+        if box_dict_format:
+            name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box['name'], gen_box['bounding_box']
+        else:
+            name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box
+        if scale_boxes:
+            # Vertical: move the boxes if out of bound
+            # Horizontal: move and scale the boxes so it spans the horizontal line
+            bbox_x = (bbox_x + shift) * scale
+            bbox_y = bbox_y * scale
+            bbox_w, bbox_h = bbox_w * scale, bbox_h * scale
+            # TODO: verify this makes the y center not moving
+            bbox_y_offset = 0
+            if bbox_top_y_min * scale + bbox_y_offset < 0:
+                bbox_y_offset -= bbox_top_y_min * scale
+            if bbox_bottom_y_max * scale + bbox_y_offset >= size_h:
+                bbox_y_offset -= bbox_bottom_y_max * scale - size_h
+            bbox_y += bbox_y_offset
+            if bbox_y < 0:
+                bbox_y, bbox_h = 0, bbox_h - bbox_y
+        name = name.rstrip(".")
+        bounding_box = (int(np.round(bbox_x)), int(np.round(bbox_y)), int(np.round(bbox_w)), int(np.round(bbox_h)))
+        if box_dict_format:
+            gen_box = {
+                'name': name,
+                'bounding_box': bounding_box
+            }
+        else:
+            gen_box = (name, bounding_box)
+        filtered_gen_boxes.append(gen_box)
+    return filtered_gen_boxes
+def draw_boxes(anns):
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+    polygons = []
+    color = []
+    for ann in anns:
+        c = (np.random.random((1, 3))*0.6+0.4)
+        [bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
+        poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h],
+                [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
+        np_poly = np.array(poly).reshape((4, 2))
+        polygons.append(Polygon(np_poly))
+        color.append(c)
+        # print(ann)
+        name = ann['name'] if 'name' in ann else str(ann['category_id'])
+        ax.text(bbox_x, bbox_y, name, style='italic',
+                bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5})
+    p = PatchCollection(polygons, facecolor='none',
+                        edgecolors=color, linewidths=2)
+    ax.add_collection(p)
+def show_boxes(gen_boxes, bg_prompt=None, ind=None, show=False):
+    if len(gen_boxes) == 0:
+        return
+    if isinstance(gen_boxes[0], dict):
+        anns = [{'name': gen_box['name'], 'bbox': gen_box['bounding_box']}
+                for gen_box in gen_boxes]
+    else:
+        anns = [{'name': gen_box[0], 'bbox': gen_box[1]} for gen_box in gen_boxes]
+    # White background (to allow line to show on the edge)
+    I = np.ones((size[0]+4, size[1]+4, 3), dtype=np.uint8) * 255
+    plt.imshow(I)
+    plt.axis('off')
+    if bg_prompt is not None:
+        ax = plt.gca()
+        ax.text(0, 0, bg_prompt, style='italic',
+                bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5})
+        c = (np.zeros((1, 3)))
+        [bbox_x, bbox_y, bbox_w, bbox_h] = (0, 0, size[1], size[0])
+        poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h],
+                [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
+        np_poly = np.array(poly).reshape((4, 2))
+        polygons = [Polygon(np_poly)]
+        color = [c]
+        p = PatchCollection(polygons, facecolor='none',
+                            edgecolors=color, linewidths=2)
+        ax.add_collection(p)
+    draw_boxes(anns)
+    if show:
+        plt.show()
+    else:
+        print("Saved to", f"{img_dir}/boxes.png", f"ind: {ind}")
+        if ind is not None:
+            plt.savefig(f"{img_dir}/boxes_{ind}.png")
+        plt.savefig(f"{img_dir}/boxes.png")
+def show_masks(masks):
+    masks_to_show = np.zeros((*size, 3), dtype=np.float32)
+    for mask in masks:
+        c = (np.random.random((3,))*0.6+0.4)
+        masks_to_show += mask[..., None] * c[None, None, :]
+    plt.imshow(masks_to_show)
+    plt.savefig(f"{img_dir}/masks.png")
+    plt.show()
+    plt.clf()
+def convert_box(box, height, width):
+    # box: x, y, w, h (in 512 format) -> x_min, y_min, x_max, y_max
+    x_min, y_min = box[0] / width, box[1] / height
+    w_box, h_box = box[2] / width, box[3] / height
+    x_max, y_max = x_min + w_box, y_min + h_box
+    return x_min, y_min, x_max, y_max
+def convert_spec(spec, height, width, include_counts=True, verbose=False):
+    # Infer from spec
+    prompt, gen_boxes, bg_prompt = spec['prompt'], spec['gen_boxes'], spec['bg_prompt']
+    # This ensures the same objects appear together because flattened `overall_phrases_bboxes` should EXACTLY correspond to `so_prompt_phrase_box_list`.
+    gen_boxes = sorted(gen_boxes, key=lambda gen_box: gen_box[0])
+    gen_boxes = [(name, convert_box(box, height=height, width=width)) for name, box in gen_boxes]
+    # NOTE: so phrase should include all the words associated to the object (otherwise "an orange dog" may be recognized as "an orange" by the model generating the background).
+    # so word should have one token that includes the word to transfer cross attention (the object name).
+    # Currently using the last word of the object name as word.
+    if bg_prompt:
+        so_prompt_phrase_word_box_list = [(f"{bg_prompt} with {name}", name, name.split(" ")[-1], box) for name, box in gen_boxes]
+    else:
+        so_prompt_phrase_word_box_list = [(f"{name}", name, name.split(" ")[-1], box) for name, box in gen_boxes]
+    objects = [gen_box[0] for gen_box in gen_boxes]
+    objects_unique, objects_count = np.unique(objects, return_counts=True)
+    num_total_matched_boxes = 0
+    overall_phrases_words_bboxes = []
+    for ind, object_name in enumerate(objects_unique):
+        bboxes = [box for name, box in gen_boxes if name == object_name]
+        if objects_count[ind] > 1:
+            phrase = p.plural_noun(object_name.replace("an ", "").replace("a ", ""))
+            if include_counts:
+                phrase = p.number_to_words(objects_count[ind]) + " " + phrase
+        else:
+            phrase = object_name
+        # Currently using the last word of the phrase as word.
+        word = phrase.split(' ')[-1]
+        num_total_matched_boxes += len(bboxes)
+        overall_phrases_words_bboxes.append((phrase, word, bboxes))
+    assert num_total_matched_boxes == len(gen_boxes), f"{num_total_matched_boxes} != {len(gen_boxes)}"
+    objects_str = ", ".join([phrase for phrase, _, _ in overall_phrases_words_bboxes])
+    if objects_str:
+        if bg_prompt:
+            overall_prompt = f"{bg_prompt} with {objects_str}"
+        else:
+            overall_prompt = objects_str
+    else:
+        overall_prompt = bg_prompt
+    if verbose:
+        print("so_prompt_phrase_word_box_list:", so_prompt_phrase_word_box_list)
+        print("overall_prompt:", overall_prompt)
+        print("overall_phrases_words_bboxes:", overall_phrases_words_bboxes)
+    return so_prompt_phrase_word_box_list, overall_prompt, overall_phrases_words_bboxes

utils/utils.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import torch
+from PIL import ImageDraw
+import numpy as np
+import os
+import gc
+torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+def draw_box(pil_img, bboxes, phrases):
+    draw = ImageDraw.Draw(pil_img)
+    # font = ImageFont.truetype('./FreeMono.ttf', 25)
+    for obj_bbox, phrase in zip(bboxes, phrases):
+        x_0, y_0, x_1, y_1 = obj_bbox[0], obj_bbox[1], obj_bbox[2], obj_bbox[3]
+        draw.rectangle([int(x_0 * 512), int(y_0 * 512), int(x_1 * 512), int(y_1 * 512)], outline='red', width=5)
+        draw.text((int(x_0 * 512) + 5, int(y_0 * 512) + 5), phrase, font=None, fill=(255, 0, 0))
+    return pil_img
+def get_centered_box(box, horizontal_center_only=True):
+    x_min, y_min, x_max, y_max = box
+    w = x_max - x_min
+    if horizontal_center_only:
+        return [0.5 - w/2, y_min, 0.5 + w/2, y_max]
+    h = y_max - y_min
+    return [0.5 - w/2, 0.5 - h/2, 0.5 + w/2, 0.5 + h/2]
+# NOTE: this changes the behavior of the function
+def proportion_to_mask(obj_box, H, W, use_legacy=False, return_np=False):
+    x_min, y_min, x_max, y_max = scale_proportion(obj_box, H, W, use_legacy)
+    if return_np:
+        mask = np.zeros((H, W))
+    else:
+        mask = torch.zeros(H, W).to(torch_device)
+    mask[y_min: y_max, x_min: x_max] = 1.
+    return mask
+def scale_proportion(obj_box, H, W, use_legacy=False):
+    if use_legacy:
+        # Bias towards the top-left corner
+        x_min, y_min, x_max, y_max = int(obj_box[0] * W), int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
+    else:
+        # Separately rounding box_w and box_h to allow shift invariant box sizes. Otherwise box sizes may change when both coordinates being rounded end with ".5".
+        x_min, y_min = round(obj_box[0] * W), round(obj_box[1] * H)
+        box_w, box_h = round((obj_box[2] - obj_box[0]) * W), round((obj_box[3] - obj_box[1]) * H)
+        x_max, y_max = x_min + box_w, y_min + box_h
+        x_min, y_min = max(x_min, 0), max(y_min, 0)
+        x_max, y_max = min(x_max, W), min(y_max, H)
+    return x_min, y_min, x_max, y_max
+def binary_mask_to_box(mask, enlarge_box_by_one=True, w_scale=1, h_scale=1):
+    if isinstance(mask, torch.Tensor):
+        mask_loc = torch.where(mask)
+    else:
+        mask_loc = np.where(mask)
+    height, width = mask.shape
+    if len(mask_loc) == 0:
+        raise ValueError('The mask is empty')
+    if enlarge_box_by_one:
+        ymin, ymax = max(min(mask_loc[0]) - 1, 0), min(max(mask_loc[0]) + 1, height)
+        xmin, xmax = max(min(mask_loc[1]) - 1, 0), min(max(mask_loc[1]) + 1, width)
+    else:
+        ymin, ymax = min(mask_loc[0]), max(mask_loc[0])
+        xmin, xmax = min(mask_loc[1]), max(mask_loc[1])
+    box = [xmin * w_scale, ymin * h_scale, xmax * w_scale, ymax * h_scale]
+    return box
+def binary_mask_to_box_mask(mask, to_device=True):
+    box = binary_mask_to_box(mask)
+    x_min, y_min, x_max, y_max = box
+    H, W = mask.shape
+    mask = torch.zeros(H, W)
+    if to_device:
+        mask = mask.to(torch_device)
+    mask[y_min: y_max+1, x_min: x_max+1] = 1.
+    return mask
+def binary_mask_to_center(mask, normalize=False):
+    """
+    This computes the mass center of the mask.
+    normalize: the coords range from 0 to 1
+    Reference: https://stackoverflow.com/a/66184125
+    """
+    h, w = mask.shape
+    total = mask.sum()
+    if isinstance(mask, torch.Tensor):
+        x_coord = ((mask.sum(dim=0) @ torch.arange(w)) / total).item()
+        y_coord = ((mask.sum(dim=1) @ torch.arange(h)) / total).item()
+    else:
+        x_coord = (mask.sum(axis=0) @ np.arange(w)) / total
+        y_coord = (mask.sum(axis=1) @ np.arange(h)) / total
+    if normalize:
+        x_coord, y_coord = x_coord / w, y_coord / h
+    return x_coord, y_coord
+def iou(mask, masks, eps=1e-6):
+    # mask: [h, w], masks: [n, h, w]
+    mask = mask[None].astype(bool)
+    masks = masks.astype(bool)
+    i = (mask & masks).sum(axis=(1,2))
+    u = (mask | masks).sum(axis=(1,2))
+    return i / (u + eps)
+def free_memory():
+    gc.collect()
+    torch.cuda.empty_cache()
+def expand_overall_bboxes(overall_bboxes):
+    """
+    Expand overall bboxes from a 3d list to 2d list:
+    Input: [[box 1 for phrase 1, box 2 for phrase 1], ...]
+    Output: [box 1, box 2, ...]
+    """
+    return sum(overall_bboxes, start=[])
+def shift_tensor(tensor, x_offset, y_offset, base_w=8, base_h=8, offset_normalized=False, ignore_last_dim=False):
+    """base_w and base_h: make sure the shift is aligned in the latent and multiple levels of cross attention"""
+    if ignore_last_dim:
+        tensor_h, tensor_w = tensor.shape[-3:-1]
+    else:
+        tensor_h, tensor_w = tensor.shape[-2:]
+    if offset_normalized:
+        assert tensor_h % base_h == 0 and tensor_w % base_w == 0, f"{tensor_h, tensor_w} is not a multiple of {base_h, base_w}"
+        scale_from_base_h, scale_from_base_w = tensor_h // base_h, tensor_w // base_w
+        x_offset, y_offset = round(x_offset * base_w) * scale_from_base_w, round(y_offset * base_h) * scale_from_base_h
+    new_tensor = torch.zeros_like(tensor)
+    overlap_w = tensor_w - abs(x_offset)
+    overlap_h = tensor_h - abs(y_offset)
+    if y_offset >= 0:
+        y_src_start = 0
+        y_dest_start = y_offset
+    else:
+        y_src_start = -y_offset
+        y_dest_start = 0
+    if x_offset >= 0:
+        x_src_start = 0
+        x_dest_start = x_offset
+    else:
+        x_src_start = -x_offset
+        x_dest_start = 0
+    if ignore_last_dim:
+        # For cross attention maps, the third to last and the second to last are the 2D dimensions after unflatten.
+        new_tensor[..., y_dest_start:y_dest_start+overlap_h, x_dest_start:x_dest_start+overlap_w, :] = tensor[..., y_src_start:y_src_start+overlap_h, x_src_start:x_src_start+overlap_w, :]
+    else:
+        new_tensor[..., y_dest_start:y_dest_start+overlap_h, x_dest_start:x_dest_start+overlap_w] = tensor[..., y_src_start:y_src_start+overlap_h, x_src_start:x_src_start+overlap_w]
+    return new_tensor