Tony Lian committed · Commit 89f6983 · Parent(s): e32648c
Update: add attention guidance and refactor the code

Files changed:
- app.py +77 -149
- examples.py +56 -6
- generation.py +412 -130
- models/modeling_utils.py +0 -874
- models/pipelines.py +352 -2
- models/sam.py +4 -2
- utils/attn.py +140 -0
- utils/boxdiff.py +259 -0
- utils/guidance.py +358 -0
- utils/latents.py +3 -2
- utils/parse.py +93 -18
- utils/utils.py +0 -1
- utils/vis.py +153 -0
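Read end to end, the diffs below show that the new attention-guidance controls in the demo UI reduce to a handful of keyword arguments on generation.run(). The following sketch is not part of the commit (the helper name generate_with_attn_guidance is invented for illustration); it only summarizes, assuming the run() signature shown in generation.py further down, how app.py's get_ours_image() maps the UI values onto that call.

# Minimal sketch (illustration only, not from the commit): how the demo's
# attention-guidance knobs map onto generation.run(), per the diffs below.
from generation import run as run_ours

def generate_with_attn_guidance(spec, num_inference_steps=50,
                                attn_guidance_step_ratio=0.6,
                                attn_guidance_scale=20, seed=0):
    # Attention guidance is applied only during the first
    # attn_guidance_step_ratio fraction of the denoising steps.
    overall_max_index_step = int(attn_guidance_step_ratio * num_inference_steps)
    image_np, so_img_list = run_ours(
        spec,
        bg_seed=seed,
        num_inference_steps=num_inference_steps,
        # Scale of the attention (semantic) guidance loss; 0 disables it.
        loss_scale=attn_guidance_scale,
        overall_loss_scale=attn_guidance_scale,
        # The demo passes max_index_step=0, i.e. it applies attention guidance
        # only in the overall pass, not in the per-box pass.
        max_index_step=0,
        overall_max_index_step=overall_max_index_step,
    )
    return image_np, so_img_list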
app.py
CHANGED
@@ -1,65 +1,27 @@
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
-
import
|
4 |
-
from matplotlib.patches import Polygon
|
5 |
-
from matplotlib.collections import PatchCollection
|
6 |
import matplotlib.pyplot as plt
|
7 |
-
from utils.parse import filter_boxes
|
8 |
from generation import run as run_ours
|
9 |
from baseline import run as run_baseline
|
10 |
import torch
|
11 |
from shared import DEFAULT_SO_NEGATIVE_PROMPT, DEFAULT_OVERALL_NEGATIVE_PROMPT
|
12 |
-
from examples import stage1_examples, stage2_examples
|
13 |
|
14 |
-
|
15 |
-
if torch.cuda.is_available():
|
16 |
-
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
|
17 |
-
|
18 |
-
box_scale = (512, 512)
|
19 |
-
size = box_scale
|
20 |
-
|
21 |
-
bg_prompt_text = "Background prompt: "
|
22 |
-
|
23 |
-
default_template = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512, and the bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object. Make the boxes larger if possible. Do not put objects that are already provided in the bounding boxes into the background prompt. If needed, you can make reasonable guesses. Generate the object descriptions and background prompts in English even if the caption might not be in English. Do not include non-existing or excluded objects in the background prompt. Please refer to the example below for the desired format.
|
24 |
-
|
25 |
-
Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
|
26 |
-
Objects: [('a green car', [21, 181, 211, 159]), ('a blue truck', [269, 181, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
|
27 |
-
Background prompt: A realistic image of a landscape scene
|
28 |
-
|
29 |
-
Caption: A watercolor painting of a wooden table in the living room with an apple on it
|
30 |
-
Objects: [('a wooden table', [65, 243, 344, 206]), ('a apple', [206, 306, 81, 69])]
|
31 |
-
Background prompt: A watercolor painting of a living room
|
32 |
-
|
33 |
-
Caption: A watercolor painting of two pandas eating bamboo in a forest
|
34 |
-
Objects: [('a panda eating bambooo', [30, 171, 212, 226]), ('a panda eating bambooo', [264, 173, 222, 221])]
|
35 |
-
Background prompt: A watercolor painting of a forest
|
36 |
-
|
37 |
-
Caption: A realistic image of four skiers standing in a line on the snow near a palm tree
|
38 |
-
Objects: [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 180, 103, 180])]
|
39 |
-
Background prompt: A realistic image of an outdoor scene with snow
|
40 |
|
41 |
-
|
42 |
-
Objects: [('a steam boat', [232, 225, 257, 149]), ('a jumping pink dolphin', [21, 249, 189, 123])]
|
43 |
-
Background prompt: An oil painting of the sea
|
44 |
-
|
45 |
-
Caption: A realistic image of a cat playing with a dog in a park with flowers
|
46 |
-
Objects: [('a playful cat', [51, 67, 271, 324]), ('a playful dog', [302, 119, 211, 228])]
|
47 |
-
Background prompt: A realistic image of a park with flowers
|
48 |
-
|
49 |
-
Caption: 一个客厅场景的油画,墙上挂着电视,电视下面是一个柜子,柜子上有一个花瓶。
|
50 |
-
Objects: [('a tv', [88, 85, 335, 203]), ('a cabinet', [57, 308, 404, 201]), ('a flower vase', [166, 222, 92, 108])]
|
51 |
-
Background prompt: An oil painting of a living room scene"""
|
52 |
-
|
53 |
-
simplified_prompt = """{template}
|
54 |
-
|
55 |
-
Caption: {prompt}
|
56 |
-
Objects: """
|
57 |
|
58 |
-
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
Background prompt: A realistic photo of a grassy area."""
|
63 |
|
64 |
def get_lmd_prompt(prompt, template=default_template):
|
65 |
if prompt == "":
|
@@ -71,10 +33,10 @@ def get_lmd_prompt(prompt, template=default_template):
|
|
71 |
def get_layout_image(response):
|
72 |
if response == "":
|
73 |
response = layout_placeholder
|
74 |
-
gen_boxes, bg_prompt =
|
75 |
fig = plt.figure(figsize=(8, 8))
|
76 |
# https://stackoverflow.com/questions/7821518/save-plot-to-numpy-array
|
77 |
-
show_boxes(gen_boxes, bg_prompt)
|
78 |
# If we haven't already shown or saved the plot, then we need to
|
79 |
# draw the figure first...
|
80 |
fig.canvas.draw()
|
@@ -88,32 +50,41 @@ def get_layout_image(response):
|
|
88 |
def get_layout_image_gallery(response):
|
89 |
return [get_layout_image(response)]
|
90 |
|
91 |
-
def get_ours_image(response, overall_prompt_override="", seed=0, num_inference_steps=
|
92 |
if response == "":
|
93 |
response = layout_placeholder
|
94 |
-
gen_boxes, bg_prompt =
|
95 |
gen_boxes = filter_boxes(gen_boxes, scale_boxes=scale_boxes)
|
96 |
spec = {
|
97 |
# prompt is unused
|
98 |
'prompt': '',
|
99 |
'gen_boxes': gen_boxes,
|
100 |
-
'bg_prompt': bg_prompt
|
|
|
101 |
}
|
102 |
|
103 |
if dpm_scheduler:
|
104 |
scheduler_key = "dpm_scheduler"
|
105 |
else:
|
106 |
scheduler_key = "scheduler"
|
107 |
-
|
|
|
|
|
108 |
image_np, so_img_list = run_ours(
|
109 |
spec, bg_seed=seed, overall_prompt_override=overall_prompt_override, fg_seed_start=fg_seed_start,
|
110 |
fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio, use_autocast=use_autocast,
|
111 |
-
gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key,
|
112 |
-
so_negative_prompt=so_negative_prompt, overall_negative_prompt=overall_negative_prompt,
|
|
|
113 |
)
|
114 |
images = [image_np]
|
115 |
if show_so_imgs:
|
116 |
images.extend([np.asarray(so_img) for so_img in so_img_list])
|
|
|
|
|
|
|
|
|
|
|
117 |
return images
|
118 |
|
119 |
def get_baseline_image(prompt, seed=0):
|
@@ -126,73 +97,6 @@ def get_baseline_image(prompt, seed=0):
|
|
126 |
image_np = run_baseline(prompt, bg_seed=seed, scheduler_key=scheduler_key, num_inference_steps=num_inference_steps)
|
127 |
return [image_np]
|
128 |
|
129 |
-
def parse_input(text=None):
|
130 |
-
try:
|
131 |
-
if "Objects: " in text:
|
132 |
-
text = text.split("Objects: ")[1]
|
133 |
-
|
134 |
-
text_split = text.split(bg_prompt_text)
|
135 |
-
if len(text_split) == 2:
|
136 |
-
gen_boxes, bg_prompt = text_split
|
137 |
-
gen_boxes = ast.literal_eval(gen_boxes)
|
138 |
-
bg_prompt = bg_prompt.strip()
|
139 |
-
except Exception as e:
|
140 |
-
raise gr.Error(f"response format invalid: {e} (text: {text})")
|
141 |
-
|
142 |
-
return gen_boxes, bg_prompt
|
143 |
-
|
144 |
-
def draw_boxes(anns):
|
145 |
-
ax = plt.gca()
|
146 |
-
ax.set_autoscale_on(False)
|
147 |
-
polygons = []
|
148 |
-
color = []
|
149 |
-
for ann in anns:
|
150 |
-
c = (np.random.random((1, 3))*0.6+0.4)
|
151 |
-
[bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox']
|
152 |
-
poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h],
|
153 |
-
[bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
|
154 |
-
np_poly = np.array(poly).reshape((4, 2))
|
155 |
-
polygons.append(Polygon(np_poly))
|
156 |
-
color.append(c)
|
157 |
-
|
158 |
-
# print(ann)
|
159 |
-
name = ann['name'] if 'name' in ann else str(ann['category_id'])
|
160 |
-
ax.text(bbox_x, bbox_y, name, style='italic',
|
161 |
-
bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5})
|
162 |
-
|
163 |
-
p = PatchCollection(polygons, facecolor='none',
|
164 |
-
edgecolors=color, linewidths=2)
|
165 |
-
ax.add_collection(p)
|
166 |
-
|
167 |
-
|
168 |
-
def show_boxes(gen_boxes, bg_prompt=None):
|
169 |
-
anns = [{'name': gen_box[0], 'bbox': gen_box[1]}
|
170 |
-
for gen_box in gen_boxes]
|
171 |
-
|
172 |
-
# White background (to allow line to show on the edge)
|
173 |
-
I = np.ones((size[0]+4, size[1]+4, 3), dtype=np.uint8) * 255
|
174 |
-
|
175 |
-
plt.imshow(I)
|
176 |
-
plt.axis('off')
|
177 |
-
|
178 |
-
if bg_prompt is not None:
|
179 |
-
ax = plt.gca()
|
180 |
-
ax.text(0, 0, bg_prompt, style='italic',
|
181 |
-
bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5})
|
182 |
-
|
183 |
-
c = np.zeros((1, 3))
|
184 |
-
[bbox_x, bbox_y, bbox_w, bbox_h] = (0, 0, size[1], size[0])
|
185 |
-
poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h],
|
186 |
-
[bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]]
|
187 |
-
np_poly = np.array(poly).reshape((4, 2))
|
188 |
-
polygons = [Polygon(np_poly)]
|
189 |
-
color = [c]
|
190 |
-
p = PatchCollection(polygons, facecolor='none',
|
191 |
-
edgecolors=color, linewidths=2)
|
192 |
-
ax.add_collection(p)
|
193 |
-
|
194 |
-
draw_boxes(anns)
|
195 |
-
|
196 |
duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>'
|
197 |
|
198 |
html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
|
@@ -200,15 +104,28 @@ html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to
|
|
200 |
<h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
|
201 |
<p><b>Tips:</b><p>
|
202 |
<p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
|
203 |
-
<p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the
|
204 |
-
<p>3. You can also try prompts in Simplified Chinese. If you want to try prompts in another language, translate the first line of last example to your language.</p>
|
205 |
<p>4. The diffusion model only runs 20 steps by default in this demo. You can make it run more steps to get higher quality images (or tweak frozen steps/guidance steps for better guidance and coherence).</p>
|
206 |
<p>5. Duplicate this space and add GPU or clone the space and run locally to skip the queue and run our model faster. (<b>Currently we are using a T4 GPU on this space, which is quite slow, and you can add a A10G to make it 5x faster</b>) {duplicate_html}</p>
|
207 |
<br/>
|
208 |
-
<p>Implementation note: In this demo, we
|
209 |
-
<style>.btn {{flex-grow: unset !important;}} </
|
210 |
"""
|
211 |
|
|
|
212 |
with gr.Blocks(
|
213 |
title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
|
214 |
) as g:
|
@@ -230,42 +147,53 @@ with gr.Blocks(
|
|
230 |
inputs=[prompt],
|
231 |
outputs=[output],
|
232 |
fn=get_lmd_prompt,
|
233 |
-
cache_examples=
|
|
|
234 |
)
|
235 |
|
236 |
with gr.Tab("Stage 2 (New). Layout to Image generation"):
|
237 |
with gr.Row():
|
238 |
with gr.Column(scale=1):
|
239 |
-
|
240 |
-
|
241 |
-
num_inference_steps = gr.Slider(1, 250, value=
|
|
|
|
|
242 |
seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
|
243 |
with gr.Accordion("Advanced options (play around for better generation)", open=False):
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
|
|
|
|
|
|
|
|
|
|
254 |
visualize_btn = gr.Button("Visualize Layout", elem_classes="btn")
|
255 |
generate_btn = gr.Button("Generate Image from Layout", variant='primary', elem_classes="btn")
|
256 |
with gr.Column(scale=1):
|
257 |
gallery = gr.Gallery(
|
258 |
label="Generated image", show_label=False, elem_id="gallery", columns=[1], rows=[1], object_fit="contain", preview=True
|
259 |
)
|
|
|
|
|
260 |
visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
|
261 |
-
generate_btn.click(fn=get_ours_image, inputs=[response, overall_prompt_override, seed, num_inference_steps, dpm_scheduler, use_autocast, fg_seed_start, fg_blending_ratio, frozen_step_ratio, gligen_scheduled_sampling_beta, so_negative_prompt, overall_negative_prompt, show_so_imgs, scale_boxes], outputs=gallery, api_name="layout-to-image")
|
262 |
|
263 |
gr.Examples(
|
264 |
examples=stage2_examples,
|
265 |
inputs=[response, overall_prompt_override, seed],
|
266 |
outputs=[gallery],
|
267 |
fn=get_ours_image,
|
268 |
-
cache_examples=
|
|
|
269 |
)
|
270 |
|
271 |
with gr.Tab("Baseline: Stable Diffusion"):
|
@@ -274,8 +202,7 @@ with gr.Blocks(
|
|
274 |
sd_prompt = gr.Textbox(lines=2, label="Prompt for baseline SD", placeholder=prompt_placeholder)
|
275 |
seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
|
276 |
generate_btn = gr.Button("Generate", elem_classes="btn")
|
277 |
-
|
278 |
-
# output = gr.Image(shape=(512, 512), elem_classes="img", elem_id="img")
|
279 |
with gr.Column(scale=1):
|
280 |
gallery = gr.Gallery(
|
281 |
label="Generated image", show_label=False, elem_id="gallery2", columns=[1], rows=[1], object_fit="contain", preview=True
|
@@ -287,7 +214,8 @@ with gr.Blocks(
|
|
287 |
inputs=[sd_prompt],
|
288 |
outputs=[gallery],
|
289 |
fn=get_baseline_image,
|
290 |
-
cache_examples=
|
|
|
291 |
)
|
292 |
|
293 |
g.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
+
import os
|
|
|
|
|
4 |
import matplotlib.pyplot as plt
|
5 |
+
from utils.parse import filter_boxes, parse_input_with_negative, show_boxes
|
6 |
from generation import run as run_ours
|
7 |
from baseline import run as run_baseline
|
8 |
import torch
|
9 |
from shared import DEFAULT_SO_NEGATIVE_PROMPT, DEFAULT_OVERALL_NEGATIVE_PROMPT
|
10 |
+
from examples import stage1_examples, stage2_examples, default_template, simplified_prompt, prompt_placeholder, layout_placeholder
|
11 |
|
12 |
+
cuda_available = torch.cuda.is_available()
|
13 |
|
14 |
+
print(f"Is CUDA available: {torch.cuda.is_available()}")
|
15 |
|
16 |
+
if cuda_available:
|
17 |
+
gpu_memory = torch.cuda.get_device_properties(torch.cuda.current_device()).total_memory
|
18 |
+
low_memory = gpu_memory <= 16 * 1024 ** 3
|
19 |
+
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}. With GPU memory: {gpu_memory}. Low memory: {low_memory}")
|
20 |
+
else:
|
21 |
+
low_memory = False
|
22 |
|
23 |
+
cache_examples = True
|
24 |
+
default_num_inference_steps = 20 if low_memory else 50
|
|
|
25 |
|
26 |
def get_lmd_prompt(prompt, template=default_template):
|
27 |
if prompt == "":
|
|
|
33 |
def get_layout_image(response):
|
34 |
if response == "":
|
35 |
response = layout_placeholder
|
36 |
+
gen_boxes, bg_prompt, neg_prompt = parse_input_with_negative(response, no_input=True)
|
37 |
fig = plt.figure(figsize=(8, 8))
|
38 |
# https://stackoverflow.com/questions/7821518/save-plot-to-numpy-array
|
39 |
+
show_boxes(gen_boxes, bg_prompt, neg_prompt)
|
40 |
# If we haven't already shown or saved the plot, then we need to
|
41 |
# draw the figure first...
|
42 |
fig.canvas.draw()
|
|
|
50 |
def get_layout_image_gallery(response):
|
51 |
return [get_layout_image(response)]
|
52 |
|
53 |
+
def get_ours_image(response, overall_prompt_override="", seed=0, num_inference_steps=250, dpm_scheduler=True, use_autocast=False, fg_seed_start=20, fg_blending_ratio=0.1, frozen_step_ratio=0.5, attn_guidance_step_ratio=0.6, gligen_scheduled_sampling_beta=0.4, attn_guidance_scale=20, use_ref_ca=True, so_negative_prompt=DEFAULT_SO_NEGATIVE_PROMPT, overall_negative_prompt=DEFAULT_OVERALL_NEGATIVE_PROMPT, show_so_imgs=False, scale_boxes=False):
|
54 |
if response == "":
|
55 |
response = layout_placeholder
|
56 |
+
gen_boxes, bg_prompt, neg_prompt = parse_input_with_negative(response, no_input=True)
|
57 |
gen_boxes = filter_boxes(gen_boxes, scale_boxes=scale_boxes)
|
58 |
spec = {
|
59 |
# prompt is unused
|
60 |
'prompt': '',
|
61 |
'gen_boxes': gen_boxes,
|
62 |
+
'bg_prompt': bg_prompt,
|
63 |
+
'extra_neg_prompt': neg_prompt
|
64 |
}
|
65 |
|
66 |
if dpm_scheduler:
|
67 |
scheduler_key = "dpm_scheduler"
|
68 |
else:
|
69 |
scheduler_key = "scheduler"
|
70 |
+
|
71 |
+
overall_max_index_step = int(attn_guidance_step_ratio * num_inference_steps)
|
72 |
+
|
73 |
image_np, so_img_list = run_ours(
|
74 |
spec, bg_seed=seed, overall_prompt_override=overall_prompt_override, fg_seed_start=fg_seed_start,
|
75 |
fg_blending_ratio=fg_blending_ratio,frozen_step_ratio=frozen_step_ratio, use_autocast=use_autocast,
|
76 |
+
so_gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, overall_gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta, num_inference_steps=num_inference_steps, scheduler_key=scheduler_key,
|
77 |
+
use_ref_ca=use_ref_ca, so_negative_prompt=so_negative_prompt, overall_negative_prompt=overall_negative_prompt,
|
78 |
+
loss_scale=attn_guidance_scale, max_index_step=0, overall_loss_scale=attn_guidance_scale, overall_max_index_step=overall_max_index_step,
|
79 |
)
|
80 |
images = [image_np]
|
81 |
if show_so_imgs:
|
82 |
images.extend([np.asarray(so_img) for so_img in so_img_list])
|
83 |
+
|
84 |
+
if cuda_available:
|
85 |
+
print(f"Max GPU memory allocated: {torch.cuda.max_memory_allocated() / 1024 ** 3:.2f} GB")
|
86 |
+
torch.cuda.reset_max_memory_allocated()
|
87 |
+
|
88 |
return images
|
89 |
|
90 |
def get_baseline_image(prompt, seed=0):
|
|
|
97 |
image_np = run_baseline(prompt, bg_seed=seed, scheduler_key=scheduler_key, num_inference_steps=num_inference_steps)
|
98 |
return [image_np]
|
99 |
|
100 |
duplicate_html = '<a style="display:inline-block" href="https://huggingface.co/spaces/longlian/llm-grounded-diffusion?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>'
|
101 |
|
102 |
html = f"""<h1>LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models</h1>
|
|
|
104 |
<h2><a href='https://llm-grounded-diffusion.github.io/'>Project Page</a> | <a href='https://bair.berkeley.edu/blog/2023/05/23/lmd/'>5-minute Blog Post</a> | <a href='https://arxiv.org/pdf/2305.13655.pdf'>ArXiv Paper</a> | <a href='https://github.com/TonyLianLong/LLM-groundedDiffusion'>Github</a> | <a href='https://llm-grounded-diffusion.github.io/#citation'>Cite our work</a> if our ideas inspire you.</h2>
|
105 |
<p><b>Tips:</b><p>
|
106 |
<p>1. If ChatGPT doesn't generate layout, add/remove the trailing space (added by default) and/or use GPT-4.</p>
|
107 |
+
<p>2. You can perform multi-round specification by giving ChatGPT follow-up requests (e.g., make the objects bigger or move the objects).</p>
|
108 |
+
<p>3. You can also try prompts in Simplified Chinese. You need to leave "prompt for overall image" empty in this case. If you want to try prompts in another language, translate the first line of last example to your language.</p>
|
109 |
<p>4. The diffusion model only runs 20 steps by default in this demo. You can make it run more steps to get higher quality images (or tweak frozen steps/guidance steps for better guidance and coherence).</p>
|
110 |
<p>5. Duplicate this space and add GPU or clone the space and run locally to skip the queue and run our model faster. (<b>Currently we are using a T4 GPU on this space, which is quite slow, and you can add a A10G to make it 5x faster</b>) {duplicate_html}</p>
|
111 |
<br/>
|
112 |
+
<p>Implementation note (updated): This demo offers several modes: the standard mode implements what is described in the paper, while the faster modes trade quality for speed by disabling attention guidance and/or per-box guidance. You can set the GLIGEN guidance steps ratio to 0 to disable GLIGEN and use only the original SD weights.</p>
|
113 |
+
<style>.btn {{flex-grow: unset !important;}}</style>
|
114 |
"""
|
115 |
|
116 |
+
def preset_change(preset):
|
117 |
+
# frozen_step_ratio, attn_guidance_step_ratio, attn_guidance_scale, use_ref_ca, so_negative_prompt
|
118 |
+
if preset == "Standard":
|
119 |
+
return gr.update(value=0.5, interactive=True), gr.update(value=0.6, interactive=True), gr.update(interactive=True), gr.update(value=True, interactive=True), gr.update(interactive=True)
|
120 |
+
elif preset == "Faster (disable attention guidance)":
|
121 |
+
return gr.update(value=0.5, interactive=True), gr.update(value=0, interactive=False), gr.update(interactive=False), gr.update(value=True, interactive=True), gr.update(interactive=True)
|
122 |
+
elif preset == "Faster (disable per-box guidance)":
|
123 |
+
return gr.update(value=0, interactive=False), gr.update(value=0.6, interactive=True), gr.update(interactive=True), gr.update(value=False, interactive=False), gr.update(interactive=False)
|
124 |
+
elif preset == "Fastest (disable both)":
|
125 |
+
return gr.update(value=0, interactive=False), gr.update(value=0, interactive=False), gr.update(interactive=False), gr.update(value=False, interactive=False), gr.update(interactive=True)
|
126 |
+
else:
|
127 |
+
raise gr.Error(f"Unknown preset {preset}")
|
128 |
+
|
129 |
with gr.Blocks(
|
130 |
title="LLM-grounded Diffusion: Enhancing Prompt Understanding of Text-to-Image Diffusion Models with Large Language Models"
|
131 |
) as g:
|
|
|
147 |
inputs=[prompt],
|
148 |
outputs=[output],
|
149 |
fn=get_lmd_prompt,
|
150 |
+
cache_examples=cache_examples,
|
151 |
+
label="example_stage1"
|
152 |
)
|
153 |
|
154 |
with gr.Tab("Stage 2 (New). Layout to Image generation"):
|
155 |
with gr.Row():
|
156 |
with gr.Column(scale=1):
|
157 |
+
overall_prompt_override = gr.Textbox(lines=2, label="Prompt for the overall image (optional but recommended)", placeholder="You can put your input prompt for layout generation here, helpful if your scene cannot be represented by background prompt and boxes only, e.g., with object interactions. If left empty: background prompt with [objects].", value="")
|
158 |
+
response = gr.Textbox(lines=8, label="Paste ChatGPT response here (no original caption needed here)", placeholder=layout_placeholder)
|
159 |
+
num_inference_steps = gr.Slider(1, 100 if low_memory else 250, value=default_num_inference_steps, step=1, label="Number of denoising steps (set to >=50 for higher generation quality)")
|
160 |
+
# Using an environment variable allows setting the default to faster/fastest on low-end GPUs.
|
161 |
+
preset = gr.Radio(label="Guidance: apply less control for faster generation", choices=["Standard", "Faster (disable attention guidance)", "Faster (disable per-box guidance)", "Fastest (disable both)"], value="Faster (disable attention guidance)" if low_memory else "Standard")
|
162 |
seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
|
163 |
with gr.Accordion("Advanced options (play around for better generation)", open=False):
|
164 |
+
with gr.Tab("Guidance"):
|
165 |
+
frozen_step_ratio = gr.Slider(0, 1, value=0.5, step=0.1, label="Foreground frozen steps ratio (higher: stronger attribute binding; lower: higher coherence)")
|
166 |
+
gligen_scheduled_sampling_beta = gr.Slider(0, 1, value=0.4, step=0.1, label="GLIGEN guidance steps ratio (the beta value, higher: stronger GLIGEN guidance)")
|
167 |
+
attn_guidance_step_ratio = gr.Slider(0, 1, value=0.6, step=0.01, label="Attention guidance steps ratio (higher: stronger attention guidance; lower: faster and higher coherence)")
|
168 |
+
attn_guidance_scale = gr.Slider(0, 50, value=20, step=0.5, label="Attention guidance scale: 0 means no attention guidance.")
|
169 |
+
use_ref_ca = gr.Checkbox(label="Using per-box attention to guide reference attention", show_label=False, value=True)
|
170 |
+
with gr.Tab("Generation"):
|
171 |
+
dpm_scheduler = gr.Checkbox(label="Use DPM scheduler (unchecked: DDIM scheduler, may have better coherence, recommend >=50 inference steps)", show_label=False, value=True)
|
172 |
+
use_autocast = gr.Checkbox(label="Use FP16 Mixed Precision (faster but with slightly lower quality)" + (" [enabled due to low GPU memory]" if low_memory else ""), show_label=False, value=True, interactive=not low_memory)
|
173 |
+
fg_seed_start = gr.Slider(0, 10000, value=20, step=1, label="Seed for foreground variation")
|
174 |
+
fg_blending_ratio = gr.Slider(0, 1, value=0.1, step=0.01, label="Variations added to foreground for single object generation (0: no variation, 1: max variation)")
|
175 |
+
scale_boxes = gr.Checkbox(label="Scale bounding boxes to just fit the scene", show_label=False, value=False)
|
176 |
+
so_negative_prompt = gr.Textbox(lines=1, label="Negative prompt for single object generation", value=DEFAULT_SO_NEGATIVE_PROMPT)
|
177 |
+
overall_negative_prompt = gr.Textbox(lines=1, label="Negative prompt for overall generation", value=DEFAULT_OVERALL_NEGATIVE_PROMPT)
|
178 |
+
show_so_imgs = gr.Checkbox(label="Show annotated single object generations", show_label=False, value=False)
|
179 |
visualize_btn = gr.Button("Visualize Layout", elem_classes="btn")
|
180 |
generate_btn = gr.Button("Generate Image from Layout", variant='primary', elem_classes="btn")
|
181 |
with gr.Column(scale=1):
|
182 |
gallery = gr.Gallery(
|
183 |
label="Generated image", show_label=False, elem_id="gallery", columns=[1], rows=[1], object_fit="contain", preview=True
|
184 |
)
|
185 |
+
preset.change(preset_change, [preset], [frozen_step_ratio, attn_guidance_step_ratio, attn_guidance_scale, use_ref_ca, so_negative_prompt])
|
186 |
+
prompt.change(None, [prompt], overall_prompt_override, _js="(x) => x")
|
187 |
visualize_btn.click(fn=get_layout_image_gallery, inputs=response, outputs=gallery, api_name="visualize-layout")
|
188 |
+
generate_btn.click(fn=get_ours_image, inputs=[response, overall_prompt_override, seed, num_inference_steps, dpm_scheduler, use_autocast, fg_seed_start, fg_blending_ratio, frozen_step_ratio, attn_guidance_step_ratio, gligen_scheduled_sampling_beta, attn_guidance_scale, use_ref_ca, so_negative_prompt, overall_negative_prompt, show_so_imgs, scale_boxes], outputs=gallery, api_name="layout-to-image")
|
189 |
|
190 |
gr.Examples(
|
191 |
examples=stage2_examples,
|
192 |
inputs=[response, overall_prompt_override, seed],
|
193 |
outputs=[gallery],
|
194 |
fn=get_ours_image,
|
195 |
+
cache_examples=cache_examples,
|
196 |
+
label="example_ours"
|
197 |
)
|
198 |
|
199 |
with gr.Tab("Baseline: Stable Diffusion"):
|
|
|
202 |
sd_prompt = gr.Textbox(lines=2, label="Prompt for baseline SD", placeholder=prompt_placeholder)
|
203 |
seed = gr.Slider(0, 10000, value=0, step=1, label="Seed")
|
204 |
generate_btn = gr.Button("Generate", elem_classes="btn")
|
205 |
+
|
|
|
206 |
with gr.Column(scale=1):
|
207 |
gallery = gr.Gallery(
|
208 |
label="Generated image", show_label=False, elem_id="gallery2", columns=[1], rows=[1], object_fit="contain", preview=True
|
|
|
214 |
inputs=[sd_prompt],
|
215 |
outputs=[gallery],
|
216 |
fn=get_baseline_image,
|
217 |
+
cache_examples=cache_examples,
|
218 |
+
label="example_sd"
|
219 |
)
|
220 |
|
221 |
g.launch()
|
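Since the click handlers above register named endpoints (api_name="visualize-layout" and api_name="layout-to-image"), the Space can also be called programmatically. The snippet below is a hedged illustration, not part of the commit: it uses the official gradio_client package against the Space id taken from the duplicate link above, and only exercises the single-input visualize-layout endpoint; the exact return format depends on the Gradio version in use.

# Hedged example (not from the commit): calling the named endpoint that
# app.py registers via api_name="visualize-layout".
from gradio_client import Client

client = Client("longlian/llm-grounded-diffusion")  # Space id assumed from the duplicate link
layout = """Caption: A realistic photo of a gray cat and an orange dog on the grass.
Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
Background prompt: A realistic photo of a grassy area."""

# "visualize-layout" takes the pasted ChatGPT response and returns the drawn boxes.
result = client.predict(layout, api_name="/visualize-layout")
print(result)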
examples.py
CHANGED
@@ -1,3 +1,45 @@
|
|
1 |
stage1_examples = [
|
2 |
["""A realistic photo of a wooden table with an apple on the left and a pear on the right."""],
|
3 |
["""A realistic photo of 4 TVs on a wall."""],
|
@@ -10,25 +52,33 @@ stage1_examples = [
|
|
10 |
|
11 |
# Layout, seed
|
12 |
stage2_examples = [
|
13 |
-
["""Caption: A realistic
|
14 |
Objects: [('a wooden table', [30, 30, 452, 452]), ('an apple', [52, 223, 50, 60]), ('a pear', [400, 240, 50, 60])]
|
15 |
-
Background prompt: A realistic
|
16 |
["""Caption: A realistic photo of 4 TVs on a wall.
|
17 |
Objects: [('a TV', [12, 108, 120, 100]), ('a TV', [132, 112, 120, 100]), ('a TV', [252, 104, 120, 100]), ('a TV', [372, 106, 120, 100])]
|
18 |
-
Background prompt: A realistic photo of a wall""", "", 0],
|
19 |
["""Caption: A realistic photo of a gray cat and an orange dog on the grass.
|
20 |
Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
|
21 |
-
Background prompt: A realistic photo of a grassy area.""", "", 0],
|
22 |
["""Caption: 一个室内场景的水彩画,一个桌子上面放着一盘水果
|
23 |
Objects: [('a table', [81, 242, 350, 210]), ('a plate of fruits', [151, 287, 210, 117])]
|
24 |
Background prompt: A watercolor painting of an indoor scene""", "", 1],
|
25 |
["""Caption: In an empty indoor scene, a blue cube directly above a red cube with a vase on the left of them.
|
26 |
Objects: [('a blue cube', [232, 116, 76, 76]), ('a red cube', [232, 212, 76, 76]), ('a vase', [100, 198, 62, 144])]
|
27 |
-
Background prompt: An empty indoor scene""", "", 2],
|
28 |
["""Caption: A realistic photo of a wooden table without bananas in an indoor scene
|
29 |
Objects: [('a wooden table', [75, 256, 365, 156])]
|
30 |
-
Background prompt: A realistic photo of an indoor scene
|
|
|
31 |
["""Caption: A realistic photo of two cars on the road.
|
32 |
Objects: [('a car', [20, 242, 235, 185]), ('a car', [275, 246, 215, 180])]
|
33 |
Background prompt: A realistic photo of a road.""", "A realistic photo of two cars on the road.", 4],
|
34 |
]
|
|
|
1 |
+
default_template = """You are an intelligent bounding box generator. I will provide you with a caption for a photo, image, or painting. Your task is to generate the bounding boxes for the objects mentioned in the caption, along with a background prompt describing the scene. The images are of size 512x512. The top-left corner has coordinate [0, 0]. The bottom-right corner has coordinnate [512, 512]. The bounding boxes should not overlap or go beyond the image boundaries. Each bounding box should be in the format of (object name, [top-left x coordinate, top-left y coordinate, box width, box height]) and include exactly one object (i.e., start the object name with "a" or "an" if possible). Do not put objects that are already provided in the bounding boxes into the background prompt. Do not include non-existing or excluded objects in the background prompt. If needed, you can make reasonable guesses. Please refer to the example below for the desired format.
|
2 |
+
|
3 |
+
Caption: A realistic image of landscape scene depicting a green car parking on the left of a blue truck, with a red air balloon and a bird in the sky
|
4 |
+
Objects: [('a green car', [21, 281, 211, 159]), ('a blue truck', [269, 283, 209, 160]), ('a red air balloon', [66, 8, 145, 135]), ('a bird', [296, 42, 143, 100])]
|
5 |
+
Background prompt: A realistic landscape scene
|
6 |
+
Negative prompt:
|
7 |
+
|
8 |
+
Caption: A realistic top-down view of a wooden table with two apples on it
|
9 |
+
Objects: [('a wooden table', [20, 148, 472, 216]), ('an apple', [150, 226, 100, 100]), ('an apple', [280, 226, 100, 100])]
|
10 |
+
Background prompt: A realistic top-down view
|
11 |
+
Negative prompt:
|
12 |
+
|
13 |
+
Caption: A realistic scene of three skiers standing in a line on the snow near a palm tree
|
14 |
+
Objects: [('a skier', [5, 152, 139, 168]), ('a skier', [278, 192, 121, 158]), ('a skier', [148, 173, 124, 155]), ('a palm tree', [404, 105, 103, 251])]
|
15 |
+
Background prompt: A realistic outdoor scene with snow
|
16 |
+
Negative prompt:
|
17 |
+
|
18 |
+
Caption: An oil painting of a pink dolphin jumping on the left of a steam boat on the sea
|
19 |
+
Objects: [('a steam boat', [232, 225, 257, 149]), ('a jumping pink dolphin', [21, 249, 189, 123])]
|
20 |
+
Background prompt: An oil painting of the sea
|
21 |
+
Negative prompt:
|
22 |
+
|
23 |
+
Caption: A cute cat and an angry dog without birds
|
24 |
+
Objects: [('a cute cat', [51, 67, 271, 324]), ('an angry dog', [302, 119, 211, 228])]
|
25 |
+
Background prompt: A realistic scene
|
26 |
+
Negative prompt: birds
|
27 |
+
|
28 |
+
Caption: Two pandas in a forest without flowers
|
29 |
+
Objects: [('a panda', [30, 171, 212, 226]), ('a panda', [264, 173, 222, 221])]
|
30 |
+
Background prompt: A forest
|
31 |
+
Negative prompt: flowers
|
32 |
+
|
33 |
+
Caption: 一个客厅场景的油画,墙上挂着一幅画,电视下面是一个柜子,柜子上有一个花瓶,画里没有椅子。
|
34 |
+
Objects: [('a painting', [88, 85, 335, 203]), ('a cabinet', [57, 308, 404, 201]), ('a flower vase', [166, 222, 92, 108]), ('a flower vase', [328, 222, 92, 108])]
|
35 |
+
Background prompt: An oil painting of a living room scene
|
36 |
+
Negative prompt: chairs"""
|
37 |
+
|
38 |
+
simplified_prompt = """{template}
|
39 |
+
|
40 |
+
Caption: {prompt}
|
41 |
+
Objects: """
|
42 |
+
|
43 |
stage1_examples = [
|
44 |
["""A realistic photo of a wooden table with an apple on the left and a pear on the right."""],
|
45 |
["""A realistic photo of 4 TVs on a wall."""],
|
|
|
52 |
|
53 |
# Layout, seed
|
54 |
stage2_examples = [
|
55 |
+
["""Caption: A realistic top-down view of a wooden table with an apple on the left and a pear on the right.
|
56 |
Objects: [('a wooden table', [30, 30, 452, 452]), ('an apple', [52, 223, 50, 60]), ('a pear', [400, 240, 50, 60])]
|
57 |
+
Background prompt: A realistic top-down view of a room""", "A realistic top-down view of a wooden table with an apple on the left and a pear on the right.", 0],
|
58 |
["""Caption: A realistic photo of 4 TVs on a wall.
|
59 |
Objects: [('a TV', [12, 108, 120, 100]), ('a TV', [132, 112, 120, 100]), ('a TV', [252, 104, 120, 100]), ('a TV', [372, 106, 120, 100])]
|
60 |
+
Background prompt: A realistic photo of a wall""", "A realistic photo of 4 TVs on a wall.", 0],
|
61 |
["""Caption: A realistic photo of a gray cat and an orange dog on the grass.
|
62 |
Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
|
63 |
+
Background prompt: A realistic photo of a grassy area.""", "A realistic photo of a gray cat and an orange dog on the grass.", 0],
|
64 |
["""Caption: 一个室内场景的水彩画,一个桌子上面放着一盘水果
|
65 |
Objects: [('a table', [81, 242, 350, 210]), ('a plate of fruits', [151, 287, 210, 117])]
|
66 |
Background prompt: A watercolor painting of an indoor scene""", "", 1],
|
67 |
["""Caption: In an empty indoor scene, a blue cube directly above a red cube with a vase on the left of them.
|
68 |
Objects: [('a blue cube', [232, 116, 76, 76]), ('a red cube', [232, 212, 76, 76]), ('a vase', [100, 198, 62, 144])]
|
69 |
+
Background prompt: An empty indoor scene""", "In an empty indoor scene, a blue cube directly above a red cube with a vase on the left of them.", 2],
|
70 |
["""Caption: A realistic photo of a wooden table without bananas in an indoor scene
|
71 |
Objects: [('a wooden table', [75, 256, 365, 156])]
|
72 |
+
Background prompt: A realistic photo of an indoor scene
|
73 |
+
Negative prompt: bananas""", "A realistic photo of a wooden table without bananas in an indoor scene", 3],
|
74 |
["""Caption: A realistic photo of two cars on the road.
|
75 |
Objects: [('a car', [20, 242, 235, 185]), ('a car', [275, 246, 215, 180])]
|
76 |
Background prompt: A realistic photo of a road.""", "A realistic photo of two cars on the road.", 4],
|
77 |
]
|
78 |
+
|
79 |
+
|
80 |
+
prompt_placeholder = "A realistic photo of a gray cat and an orange dog on the grass."
|
81 |
+
|
82 |
+
layout_placeholder = """Caption: A realistic photo of a gray cat and an orange dog on the grass.
|
83 |
+
Objects: [('a gray cat', [67, 243, 120, 126]), ('an orange dog', [265, 193, 190, 210])]
|
84 |
+
Background prompt: A realistic photo of a grassy area."""
|
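The template above asks the LLM for three fields: an Objects list, a Background prompt, and (new in this commit) a Negative prompt. The demo parses pasted responses with parse_input_with_negative from utils/parse.py, which this diff does not show; the helper below is only a simplified, self-contained stand-in to illustrate the format.

# Illustrative stand-in for utils/parse.parse_input_with_negative
# (the real implementation lives in utils/parse.py, not shown in this diff).
import ast

def parse_layout_response(text):
    # Drop everything before "Objects: " (e.g., the echoed caption).
    if "Objects: " in text:
        text = text.split("Objects: ")[1]
    # Optional trailing "Negative prompt: ..." line.
    neg_prompt = ""
    if "Negative prompt: " in text:
        text, neg_prompt = text.split("Negative prompt: ")
        neg_prompt = neg_prompt.strip()
    # What remains is the box list followed by "Background prompt: ...".
    boxes_str, bg_prompt = text.split("Background prompt: ")
    gen_boxes = ast.literal_eval(boxes_str.strip())
    return gen_boxes, bg_prompt.strip(), neg_prompt

layout = """Caption: A realistic photo of a wooden table without bananas in an indoor scene
Objects: [('a wooden table', [75, 256, 365, 156])]
Background prompt: A realistic photo of an indoor scene
Negative prompt: bananas"""
gen_boxes, bg_prompt, neg_prompt = parse_layout_response(layout)
# gen_boxes -> [('a wooden table', [75, 256, 365, 156])], neg_prompt -> 'bananas'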
generation.py
CHANGED
@@ -1,19 +1,24 @@
|
|
1 |
-
version = "v3.0"
|
2 |
-
|
3 |
import torch
|
4 |
-
import numpy as np
|
5 |
import models
|
6 |
import utils
|
7 |
from models import pipelines, sam
|
8 |
-
from utils import parse, latents
|
9 |
-
from shared import
|
10 |
-
|
|
|
|
|
|
|
|
|
11 |
|
12 |
verbose = False
|
13 |
-
# Accelerates per-box generation
|
14 |
-
use_fast_schedule = True
|
15 |
|
16 |
-
vae, tokenizer, text_encoder, unet, dtype =
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
model_dict.update(sam_model_dict)
|
19 |
|
@@ -21,195 +26,472 @@ model_dict.update(sam_model_dict)
|
|
21 |
# Hyperparams
|
22 |
height = 512 # default height of Stable Diffusion
|
23 |
width = 512 # default width of Stable Diffusion
|
24 |
-
H, W = height // 8, width // 8
|
25 |
guidance_scale = 7.5 # Scale for classifier-free guidance
|
26 |
|
27 |
# batch size that is not 1 is not supported
|
28 |
overall_batch_size = 1
|
29 |
|
|
|
|
|
|
|
30 |
# discourage masks with confidence below
|
31 |
discourage_mask_below_confidence = 0.85
|
32 |
|
33 |
# discourage masks with iou (with coarse binarized attention mask) below
|
34 |
discourage_mask_below_coarse_iou = 0.25
|
35 |
|
|
|
|
|
|
|
36 |
run_ind = None
|
37 |
|
38 |
|
39 |
-
def
|
91 |
mask_selected_tensor = torch.tensor(mask_selected)
|
103 |
if not so_prompt_phrase_word_box_list:
|
104 |
-
return latents_all_list, mask_tensor_list
|
105 |
-
|
106 |
-
prompts, bboxes, phrases, words = [], [], [], []
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
115 |
|
116 |
-
return latents_all_list, mask_tensor_list, so_img_list
|
117 |
|
118 |
|
119 |
# Note: need to keep the supervision, especially the box coordinates, corresponding to each other in single object and overall generation.
|
120 |
|
|
|
121 |
def run(
|
122 |
-
spec,
|
123 |
-
|
124 |
-
|
|
|
|
|
125 |
):
|
126 |
-
"""
|
127 |
so_center_box: using centered box in single object generation
|
128 |
so_horizontal_center_only: move to the center horizontally only
|
129 |
-
|
130 |
align_with_overall_bboxes: Align the center of the mask, latents, and cross-attention with the center of the box in overall bboxes
|
131 |
horizontal_shift_only: only shift horizontally for the alignment of mask, latents, and cross-attention
|
132 |
"""
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
frozen_step_ratio = min(max(frozen_step_ratio, 0.), 1.)
|
137 |
frozen_steps = int(num_inference_steps * frozen_step_ratio)
|
138 |
|
139 |
-
|
140 |
-
|
|
|
|
|
141 |
|
142 |
if overall_prompt_override and overall_prompt_override.strip():
|
143 |
overall_prompt = overall_prompt_override.strip()
|
144 |
|
145 |
-
overall_phrases, overall_words, overall_bboxes =
|
|
|
|
|
|
|
|
|
146 |
|
147 |
# The so box is centered but the overall boxes are not (since we need to place to the right place).
|
148 |
if so_center_box:
|
149 |
-
so_prompt_phrase_word_box_list = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
150 |
if verbose:
|
151 |
-
print(
|
|
|
|
|
152 |
so_boxes = [item[-1] for item in so_prompt_phrase_word_box_list]
|
153 |
|
|
|
154 |
sam_refine_kwargs = dict(
|
155 |
-
discourage_mask_below_confidence=discourage_mask_below_confidence,
|
156 |
-
|
|
|
|
|
|
|
|
|
157 |
)
|
158 |
-
|
|
|
|
|
|
|
|
|
|
|
159 |
# Note that so and overall use different negative prompts
|
160 |
|
161 |
with torch.autocast("cuda", enabled=use_autocast):
|
162 |
so_prompts = [item[0] for item in so_prompt_phrase_word_box_list]
|
163 |
if so_prompts:
|
164 |
-
so_input_embeddings = models.encode_prompts(
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
else:
|
166 |
so_input_embeddings = []
|
167 |
|
168 |
-
overall_input_embeddings = models.encode_prompts(prompts=[overall_prompt], tokenizer=tokenizer, negative_prompt=overall_negative_prompt, text_encoder=text_encoder)
|
169 |
-
|
170 |
input_latents_list, latents_bg = latents.get_input_latents_list(
|
171 |
-
model_dict,
|
172 |
-
|
|
|
|
|
173 |
)
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
)
|
180 |
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
)
|
187 |
-
|
|
|
|
|
|
|
188 |
overall_bboxes_flattened, overall_phrases_flattened = [], []
|
189 |
for overall_bboxes_item, overall_phrase in zip(overall_bboxes, overall_phrases):
|
190 |
for overall_bbox in overall_bboxes_item:
|
191 |
overall_bboxes_flattened.append(overall_bbox)
|
192 |
overall_phrases_flattened.append(overall_phrase)
|
193 |
|
|
|
194 |
# Generate with composed latents
|
195 |
|
196 |
# Foreground should be frozen
|
197 |
frozen_mask = foreground_indices != 0
|
198 |
-
|
199 |
-
|
200 |
-
model_dict,
|
201 |
-
|
202 |
-
|
203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
204 |
)
|
205 |
|
206 |
-
print(
|
|
|
|
|
207 |
print("Generation from composed latents (with semantic guidance)")
|
208 |
|
209 |
-
|
210 |
-
|
211 |
-
gc.collect()
|
212 |
-
torch.cuda.empty_cache()
|
213 |
-
|
214 |
-
return images[0], so_img_list
|
215 |
|
|
|
|
|
|
|
|
1 |
import torch
|
|
|
2 |
import models
|
3 |
import utils
|
4 |
from models import pipelines, sam
|
5 |
+
from utils import parse, guidance, attn, latents, vis
|
6 |
+
from shared import (
|
7 |
+
model_dict,
|
8 |
+
sam_model_dict,
|
9 |
+
DEFAULT_SO_NEGATIVE_PROMPT,
|
10 |
+
DEFAULT_OVERALL_NEGATIVE_PROMPT,
|
11 |
+
)
|
12 |
|
13 |
verbose = False
|
|
|
|
|
14 |
|
15 |
+
vae, tokenizer, text_encoder, unet, dtype = (
|
16 |
+
model_dict.vae,
|
17 |
+
model_dict.tokenizer,
|
18 |
+
model_dict.text_encoder,
|
19 |
+
model_dict.unet,
|
20 |
+
model_dict.dtype,
|
21 |
+
)
|
22 |
|
23 |
model_dict.update(sam_model_dict)
|
24 |
|
|
|
26 |
# Hyperparams
|
27 |
height = 512 # default height of Stable Diffusion
|
28 |
width = 512 # default width of Stable Diffusion
|
29 |
+
H, W = height // 8, width // 8 # size of the latent
|
30 |
guidance_scale = 7.5 # Scale for classifier-free guidance
|
31 |
|
32 |
# batch size that is not 1 is not supported
|
33 |
overall_batch_size = 1
|
34 |
|
35 |
+
# semantic guidance kwargs (single object)
|
36 |
+
guidance_attn_keys = pipelines.DEFAULT_GUIDANCE_ATTN_KEYS
|
37 |
+
|
38 |
# discourage masks with confidence below
|
39 |
discourage_mask_below_confidence = 0.85
|
40 |
|
41 |
# discourage masks with iou (with coarse binarized attention mask) below
|
42 |
discourage_mask_below_coarse_iou = 0.25
|
43 |
|
44 |
+
# This controls the foreground variations
|
45 |
+
fg_blending_ratio = 0.1
|
46 |
+
|
47 |
run_ind = None
|
48 |
|
49 |
|
50 |
+
def generate_single_object_with_box(
|
51 |
+
prompt,
|
52 |
+
box,
|
53 |
+
phrase,
|
54 |
+
word,
|
55 |
+
input_latents,
|
56 |
+
input_embeddings,
|
57 |
+
semantic_guidance_kwargs,
|
58 |
+
obj_attn_key,
|
59 |
+
saved_cross_attn_keys,
|
60 |
+
sam_refine_kwargs,
|
61 |
+
num_inference_steps,
|
62 |
+
gligen_scheduled_sampling_beta=0.3,
|
63 |
+
verbose=False,
|
64 |
+
visualize=False,
|
65 |
+
**kwargs,
|
66 |
+
):
|
67 |
+
bboxes, phrases, words = [box], [phrase], [word]
|
68 |
+
|
69 |
+
if verbose:
|
70 |
+
print(f"Getting token map (prompt: {prompt})")
|
71 |
+
|
72 |
+
object_positions, word_token_indices = guidance.get_phrase_indices(
|
73 |
+
tokenizer=tokenizer,
|
74 |
+
prompt=prompt,
|
75 |
+
phrases=phrases,
|
76 |
+
words=words,
|
77 |
+
return_word_token_indices=True,
|
78 |
+
# Since the prompt for single object is from background prompt + object name, we will not have the case of not found
|
79 |
+
add_suffix_if_not_found=False,
|
80 |
+
verbose=verbose,
|
81 |
+
)
|
82 |
+
# phrases only has one item, so we select the first item in word_token_indices
|
83 |
+
word_token_index = word_token_indices[0]
|
84 |
+
|
85 |
+
if verbose:
|
86 |
+
print("word_token_index:", word_token_index)
|
87 |
+
|
88 |
+
# `offload_guidance_cross_attn_to_cpu` will greatly slow down generation
|
89 |
+
(
|
90 |
+
latents,
|
91 |
+
single_object_images,
|
92 |
+
saved_attns,
|
93 |
+
single_object_pil_images_box_ann,
|
94 |
+
latents_all,
|
95 |
+
) = pipelines.generate_gligen(
|
96 |
+
model_dict,
|
97 |
+
input_latents,
|
98 |
+
input_embeddings,
|
99 |
+
num_inference_steps,
|
100 |
+
bboxes,
|
101 |
+
phrases,
|
102 |
+
gligen_scheduled_sampling_beta=gligen_scheduled_sampling_beta,
|
103 |
+
guidance_scale=guidance_scale,
|
104 |
+
return_saved_cross_attn=True,
|
105 |
+
semantic_guidance=True,
|
106 |
+
semantic_guidance_bboxes=bboxes,
|
107 |
+
semantic_guidance_object_positions=object_positions,
|
108 |
+
semantic_guidance_kwargs=semantic_guidance_kwargs,
|
109 |
+
saved_cross_attn_keys=[obj_attn_key, *saved_cross_attn_keys],
|
110 |
+
return_cond_ca_only=True,
|
111 |
+
return_token_ca_only=word_token_index,
|
112 |
+
offload_cross_attn_to_cpu=False,
|
113 |
+
return_box_vis=True,
|
114 |
+
save_all_latents=True,
|
115 |
+
dynamic_num_inference_steps=True,
|
116 |
+
**kwargs,
|
117 |
+
)
|
118 |
+
# `saved_cross_attn_keys` kwargs may have duplicates
|
119 |
+
|
120 |
+
utils.free_memory()
|
121 |
+
|
122 |
+
single_object_pil_image_box_ann = single_object_pil_images_box_ann[0]
|
123 |
+
|
124 |
+
if visualize:
|
125 |
+
print("Single object image")
|
126 |
+
vis.display(single_object_pil_image_box_ann)
|
127 |
+
|
128 |
+
mask_selected, conf_score_selected = sam.sam_refine_box(
|
129 |
+
sam_input_image=single_object_images[0],
|
130 |
+
box=box,
|
131 |
+
model_dict=model_dict,
|
132 |
+
verbose=verbose,
|
133 |
+
**sam_refine_kwargs,
|
134 |
+
)
|
135 |
+
|
136 |
mask_selected_tensor = torch.tensor(mask_selected)
|
137 |
+
|
138 |
+
if verbose:
|
139 |
+
vis.visualize(mask_selected, "Mask (selected) after resize")
|
140 |
+
# This is only for visualizations
|
141 |
+
masked_latents = latents_all * mask_selected_tensor[None, None, None, ...]
|
142 |
+
vis.visualize_masked_latents(
|
143 |
+
latents_all, masked_latents, timestep_T=False, timestep_0=True
|
144 |
+
)
|
145 |
+
|
146 |
+
return (
|
147 |
+
latents_all,
|
148 |
+
mask_selected_tensor,
|
149 |
+
saved_attns,
|
150 |
+
single_object_pil_image_box_ann,
|
151 |
+
)
|
152 |
+
|
153 |
+
|
154 |
+
def get_masked_latents_all_list(
|
155 |
+
so_prompt_phrase_word_box_list,
|
156 |
+
input_latents_list,
|
157 |
+
so_input_embeddings,
|
158 |
+
verbose=False,
|
159 |
+
**kwargs,
|
160 |
+
):
|
161 |
+
latents_all_list, mask_tensor_list, saved_attns_list, so_img_list = [], [], [], []
|
162 |
+
|
163 |
if not so_prompt_phrase_word_box_list:
|
164 |
+
return latents_all_list, mask_tensor_list, saved_attns_list
|
|
|
|
|
165 |
|
166 |
+
so_uncond_embeddings, so_cond_embeddings = so_input_embeddings
|
167 |
+
|
168 |
+
for idx, ((prompt, phrase, word, box), input_latents) in enumerate(
|
169 |
+
zip(so_prompt_phrase_word_box_list, input_latents_list)
|
170 |
+
):
|
171 |
+
so_current_cond_embeddings = so_cond_embeddings[idx : idx + 1]
|
172 |
+
so_current_text_embeddings = torch.cat(
|
173 |
+
[so_uncond_embeddings, so_current_cond_embeddings], dim=0
|
174 |
+
)
|
175 |
+
so_current_input_embeddings = (
|
176 |
+
so_current_text_embeddings,
|
177 |
+
so_uncond_embeddings,
|
178 |
+
so_current_cond_embeddings,
|
179 |
+
)
|
180 |
+
|
181 |
+
latents_all, mask_tensor, saved_attns, so_img = generate_single_object_with_box(
|
182 |
+
prompt,
|
183 |
+
box,
|
184 |
+
phrase,
|
185 |
+
word,
|
186 |
+
input_latents,
|
187 |
+
input_embeddings=so_current_input_embeddings,
|
188 |
+
verbose=verbose,
|
189 |
+
**kwargs,
|
190 |
+
)
|
191 |
+
latents_all_list.append(latents_all)
|
192 |
+
mask_tensor_list.append(mask_tensor)
|
193 |
+
saved_attns_list.append(saved_attns)
|
194 |
+
so_img_list.append(so_img)
|
195 |
|
196 |
+
return latents_all_list, mask_tensor_list, saved_attns_list, so_img_list
|
197 |
|
198 |
|
199 |
# Note: need to keep the supervision, especially the box coordinates, corresponding to each other in single object and overall generation.
|
200 |
|
201 |
+
|
202 |
def run(
|
203 |
+
spec,
|
204 |
+
bg_seed=1,
|
205 |
+
overall_prompt_override="",
|
206 |
+
fg_seed_start=20,
|
207 |
+
frozen_step_ratio=0.4,
|
208 |
+
num_inference_steps=20,
|
209 |
+
loss_scale=20,
|
210 |
+
loss_threshold=5.0,
|
211 |
+
max_iter=[2] * 5 + [1] * 10,
|
212 |
+
max_index_step=15,
|
213 |
+
overall_loss_scale=20,
|
214 |
+
overall_loss_threshold=5.0,
|
215 |
+
overall_max_iter=[4] * 5 + [3] * 5 + [2] * 5 + [2] * 5 + [1] * 10,
|
216 |
+
overall_max_index_step=30,
|
217 |
+
so_gligen_scheduled_sampling_beta=0.4,
|
218 |
+
overall_gligen_scheduled_sampling_beta=0.4,
|
219 |
+
ref_ca_loss_weight=0.5,
|
220 |
+
so_center_box=False,
|
221 |
+
fg_blending_ratio=0.1,
|
222 |
+
scheduler_key="dpm_scheduler",
|
223 |
+
so_negative_prompt=DEFAULT_SO_NEGATIVE_PROMPT,
|
224 |
+
overall_negative_prompt=DEFAULT_OVERALL_NEGATIVE_PROMPT,
|
225 |
+
so_horizontal_center_only=True,
|
226 |
+
align_with_overall_bboxes=False,
|
227 |
+
horizontal_shift_only=True,
|
228 |
+
use_fast_schedule=True,
|
229 |
+
# Transfer the cross-attention from single object generation (with ref_ca_saved_attns)
|
230 |
+
# Use reference cross attention to guide the cross attention in the overall generation
|
231 |
+
use_ref_ca=True,
|
232 |
+
use_autocast=False,
|
233 |
):
|
234 |
+
"""
|
235 |
so_center_box: using centered box in single object generation
|
236 |
so_horizontal_center_only: move to the center horizontally only
|
237 |
+
|
238 |
align_with_overall_bboxes: Align the center of the mask, latents, and cross-attention with the center of the box in overall bboxes
|
239 |
horizontal_shift_only: only shift horizontally for the alignment of mask, latents, and cross-attention
|
240 |
"""
|
241 |
+
|
242 |
+
frozen_step_ratio = min(max(frozen_step_ratio, 0.0), 1.0)
|
|
|
|
|
243 |
frozen_steps = int(num_inference_steps * frozen_step_ratio)
|
244 |
|
245 |
+
print(
|
246 |
+
"generation:",
|
247 |
+
spec,
|
248 |
+
bg_seed,
|
249 |
+
fg_seed_start,
|
250 |
+
frozen_step_ratio,
|
251 |
+
so_gligen_scheduled_sampling_beta,
|
252 |
+
overall_gligen_scheduled_sampling_beta,
|
253 |
+
overall_max_index_step,
|
254 |
+
)
|
255 |
+
|
256 |
+
(
|
257 |
+
so_prompt_phrase_word_box_list,
|
258 |
+
overall_prompt,
|
259 |
+
overall_phrases_words_bboxes,
|
260 |
+
) = parse.convert_spec(spec, height, width, verbose=verbose)
|
261 |
|
262 |
if overall_prompt_override and overall_prompt_override.strip():
|
263 |
overall_prompt = overall_prompt_override.strip()
|
264 |
|
265 |
+
overall_phrases, overall_words, overall_bboxes = (
|
266 |
+
[item[0] for item in overall_phrases_words_bboxes],
|
267 |
+
[item[1] for item in overall_phrases_words_bboxes],
|
268 |
+
[item[2] for item in overall_phrases_words_bboxes],
|
269 |
+
)
|
270 |
|
271 |
# The so box is centered but the overall boxes are not (since we need to place to the right place).
|
272 |
if so_center_box:
|
273 |
+
so_prompt_phrase_word_box_list = [
|
274 |
+
(
|
275 |
+
prompt,
|
276 |
+
phrase,
|
277 |
+
word,
|
278 |
+
utils.get_centered_box(
|
279 |
+
bbox, horizontal_center_only=so_horizontal_center_only
|
280 |
+
),
|
281 |
+
)
|
282 |
+
for prompt, phrase, word, bbox in so_prompt_phrase_word_box_list
|
283 |
+
]
|
284 |
if verbose:
|
285 |
+
print(
|
286 |
+
f"centered so_prompt_phrase_word_box_list: {so_prompt_phrase_word_box_list}"
|
287 |
+
)
|
288 |
so_boxes = [item[-1] for item in so_prompt_phrase_word_box_list]
|
289 |
|
290 |
+
so_negative_prompt = DEFAULT_SO_NEGATIVE_PROMPT
|
291 |
+
overall_negative_prompt = DEFAULT_OVERALL_NEGATIVE_PROMPT
|
292 |
+
if "extra_neg_prompt" in spec and spec["extra_neg_prompt"]:
|
293 |
+
so_negative_prompt = spec["extra_neg_prompt"] + ", " + so_negative_prompt
|
294 |
+
overall_negative_prompt = (
|
295 |
+
spec["extra_neg_prompt"] + ", " + overall_negative_prompt
|
296 |
+
)
|
297 |
+
|
298 |
+
semantic_guidance_kwargs = dict(
|
299 |
+
loss_scale=loss_scale,
|
300 |
+
loss_threshold=loss_threshold,
|
301 |
+
max_iter=max_iter,
|
302 |
+
max_index_step=max_index_step,
|
303 |
+
use_ratio_based_loss=False,
|
304 |
+
guidance_attn_keys=guidance_attn_keys,
|
305 |
+
verbose=True,
|
306 |
+
)
|
307 |
+
|
308 |
sam_refine_kwargs = dict(
|
309 |
+
discourage_mask_below_confidence=discourage_mask_below_confidence,
|
310 |
+
discourage_mask_below_coarse_iou=discourage_mask_below_coarse_iou,
|
311 |
+
height=height,
|
312 |
+
width=width,
|
313 |
+
H=H,
|
314 |
+
W=W,
|
315 |
)
|
316 |
+
|
317 |
+
if verbose:
|
318 |
+
vis.visualize_bboxes(
|
319 |
+
bboxes=[item[-1] for item in so_prompt_phrase_word_box_list], H=H, W=W
|
320 |
+
)
|
321 |
+
|
322 |
# Note that so and overall use different negative prompts
|
323 |
|
324 |
with torch.autocast("cuda", enabled=use_autocast):
|
325 |
so_prompts = [item[0] for item in so_prompt_phrase_word_box_list]
|
326 |
if so_prompts:
|
327 |
+
so_input_embeddings = models.encode_prompts(
|
328 |
+
prompts=so_prompts,
|
329 |
+
tokenizer=tokenizer,
|
330 |
+
text_encoder=text_encoder,
|
331 |
+
negative_prompt=so_negative_prompt,
|
332 |
+
one_uncond_input_only=True,
|
333 |
+
)
|
334 |
else:
|
335 |
so_input_embeddings = []
|
336 |
|
|
|
|
|
337 |
input_latents_list, latents_bg = latents.get_input_latents_list(
|
338 |
+
model_dict,
|
339 |
+
bg_seed=bg_seed,
|
340 |
+
fg_seed_start=fg_seed_start,
|
341 |
+
so_boxes=so_boxes,
|
342 |
+
fg_blending_ratio=fg_blending_ratio,
|
343 |
+
height=height,
|
344 |
+
width=width,
|
345 |
+
verbose=False,
|
346 |
+
)
|
347 |
+
|
348 |
+
if use_fast_schedule:
|
349 |
+
fast_after_steps = max(frozen_steps, overall_max_index_step) if use_ref_ca else frozen_steps
|
350 |
+
else:
|
351 |
+
fast_after_steps = None
|
352 |
+
|
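# Worked example (illustration, assuming the demo defaults above): with
# num_inference_steps=20 and frozen_step_ratio=0.5, frozen_steps=10; with
# attn_guidance_step_ratio=0.6, overall_max_index_step=int(0.6*20)=12, so when
# use_ref_ca is enabled the fast schedule only starts after step max(10, 12)=12.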
353 |
+
if use_ref_ca or frozen_steps > 0:
|
354 |
+
(
|
355 |
+
latents_all_list,
|
356 |
+
mask_tensor_list,
|
357 |
+
saved_attns_list,
|
358 |
+
so_img_list,
|
359 |
+
) = get_masked_latents_all_list(
|
360 |
+
so_prompt_phrase_word_box_list,
|
361 |
+
input_latents_list,
|
362 |
+
gligen_scheduled_sampling_beta=so_gligen_scheduled_sampling_beta,
|
363 |
+
semantic_guidance_kwargs=semantic_guidance_kwargs,
|
364 |
+
obj_attn_key=("down", 2, 1, 0),
|
365 |
+
saved_cross_attn_keys=guidance_attn_keys if use_ref_ca else [],
|
366 |
+
sam_refine_kwargs=sam_refine_kwargs,
|
367 |
+
so_input_embeddings=so_input_embeddings,
|
368 |
+
num_inference_steps=num_inference_steps,
|
369 |
+
scheduler_key=scheduler_key,
|
370 |
+
verbose=verbose,
|
371 |
+
fast_after_steps=fast_after_steps,
|
372 |
+
fast_rate=2,
|
373 |
+
)
|
374 |
+
else:
|
375 |
+
# No per-box guidance
|
376 |
+
(latents_all_list, mask_tensor_list, saved_attns_list, so_img_list) = [], [], [], []
|
377 |
+
|
378 |
+
(
|
379 |
+
composed_latents,
|
380 |
+
foreground_indices,
|
381 |
+
offset_list,
|
382 |
+
) = latents.compose_latents_with_alignment(
|
383 |
+
model_dict,
|
384 |
+
latents_all_list,
|
385 |
+
mask_tensor_list,
|
386 |
+
num_inference_steps,
|
387 |
+
overall_batch_size,
|
388 |
+
height,
|
389 |
+
width,
|
390 |
+
latents_bg=latents_bg,
|
391 |
+
align_with_overall_bboxes=align_with_overall_bboxes,
|
392 |
+
overall_bboxes=overall_bboxes,
|
393 |
+
horizontal_shift_only=horizontal_shift_only,
|
394 |
+
use_fast_schedule=use_fast_schedule,
|
395 |
+
fast_after_steps=fast_after_steps,
|
396 |
)
|
397 |
+
|
398 |
+
# NOTE: need to ensure overall embeddings are generated after the update of overall prompt
|
399 |
+
(
|
400 |
+
overall_object_positions,
|
401 |
+
overall_word_token_indices,
|
402 |
+
overall_prompt
|
403 |
+
) = guidance.get_phrase_indices(
|
404 |
+
tokenizer=tokenizer,
|
405 |
+
prompt=overall_prompt,
|
406 |
+
phrases=overall_phrases,
|
407 |
+
words=overall_words,
|
408 |
+
verbose=verbose,
|
409 |
+
return_word_token_indices=True,
|
410 |
+
add_suffix_if_not_found=True
|
411 |
)
|
412 |
|
413 |
+
overall_input_embeddings = models.encode_prompts(
|
414 |
+
prompts=[overall_prompt],
|
415 |
+
tokenizer=tokenizer,
|
416 |
+
negative_prompt=overall_negative_prompt,
|
417 |
+
text_encoder=text_encoder,
|
418 |
)
|
419 |
+
|
420 |
+
if use_ref_ca:
|
421 |
+
# ref_ca_saved_attns has the same hierarchy as bboxes
|
422 |
+
ref_ca_saved_attns = []
|
423 |
+
|
424 |
+
flattened_box_idx = 0
|
425 |
+
for bboxes in overall_bboxes:
|
426 |
+
# bboxes: correspond to a phrase
|
427 |
+
ref_ca_current_phrase_saved_attns = []
|
428 |
+
for bbox in bboxes:
|
429 |
+
# each individual bbox
|
430 |
+
saved_attns = saved_attns_list[flattened_box_idx]
|
431 |
+
if align_with_overall_bboxes:
|
432 |
+
offset = offset_list[flattened_box_idx]
|
433 |
+
saved_attns = attn.shift_saved_attns(
|
434 |
+
saved_attns,
|
435 |
+
offset,
|
436 |
+
guidance_attn_keys=guidance_attn_keys,
|
437 |
+
horizontal_shift_only=horizontal_shift_only,
|
438 |
+
)
|
439 |
+
ref_ca_current_phrase_saved_attns.append(saved_attns)
|
440 |
+
flattened_box_idx += 1
|
441 |
+
ref_ca_saved_attns.append(ref_ca_current_phrase_saved_attns)
|
442 |
+
|
443 |
overall_bboxes_flattened, overall_phrases_flattened = [], []
|
444 |
for overall_bboxes_item, overall_phrase in zip(overall_bboxes, overall_phrases):
|
445 |
for overall_bbox in overall_bboxes_item:
|
446 |
overall_bboxes_flattened.append(overall_bbox)
|
447 |
overall_phrases_flattened.append(overall_phrase)
|
448 |
|
449 |
+
# This is currently not-shared with the single object one.
|
450 |
+
overall_semantic_guidance_kwargs = dict(
|
451 |
+
loss_scale=overall_loss_scale,
|
452 |
+
loss_threshold=overall_loss_threshold,
|
453 |
+
max_iter=overall_max_iter,
|
454 |
+
max_index_step=overall_max_index_step,
|
455 |
+
# ref_ca comes from the attention map of the word token of the phrase in single object generation, so we apply it only to the word token of the phrase in overall generation.
|
456 |
+
ref_ca_word_token_only=True,
|
457 |
+
# If a word is not provided, we use the last token.
|
458 |
+
ref_ca_last_token_only=True,
|
459 |
+
ref_ca_saved_attns=ref_ca_saved_attns if use_ref_ca else None,
|
460 |
+
word_token_indices=overall_word_token_indices,
|
461 |
+
guidance_attn_keys=guidance_attn_keys,
|
462 |
+
ref_ca_loss_weight=ref_ca_loss_weight,
|
463 |
+
use_ratio_based_loss=False,
|
464 |
+
verbose=True,
|
465 |
+
)
|
466 |
+
|
467 |
# Generate with composed latents
|
468 |
|
469 |
# Foreground should be frozen
|
470 |
frozen_mask = foreground_indices != 0
|
471 |
+
|
472 |
+
_, images = pipelines.generate_gligen(
|
473 |
+
model_dict,
|
474 |
+
composed_latents,
|
475 |
+
overall_input_embeddings,
|
476 |
+
num_inference_steps,
|
477 |
+
overall_bboxes_flattened,
|
478 |
+
overall_phrases_flattened,
|
479 |
+
guidance_scale=guidance_scale,
|
480 |
+
gligen_scheduled_sampling_beta=overall_gligen_scheduled_sampling_beta,
|
481 |
+
semantic_guidance=True,
|
482 |
+
semantic_guidance_bboxes=overall_bboxes,
|
483 |
+
semantic_guidance_object_positions=overall_object_positions,
|
484 |
+
semantic_guidance_kwargs=overall_semantic_guidance_kwargs,
|
485 |
+
frozen_steps=frozen_steps,
|
486 |
+
frozen_mask=frozen_mask,
|
487 |
+
scheduler_key=scheduler_key,
|
488 |
)
|
489 |
|
490 |
+
print(
|
491 |
+
f"Generation with spatial guidance from input latents and first {frozen_steps} steps frozen (directly from the composed latents input)"
|
492 |
+
)
|
493 |
print("Generation from composed latents (with semantic guidance)")
|
494 |
|
495 |
+
utils.free_memory()
|
|
|
|
|
|
|
|
|
|
|
496 |
|
497 |
+
return images[0], so_img_list
|
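The overall pass above freezes the composed foreground for the first `frozen_steps` denoising steps via `frozen_mask` (derived from `foreground_indices`), so the per-box results survive while the background is generated around them. Below is a minimal sketch of that blending idea only; the function and variable names are illustrative, not the repository's exact helpers.

import torch

def blend_frozen_latents(denoised_latents, composed_latents_at_step, frozen_mask):
    # frozen_mask is 1 where a single-object latent was composed in, 0 elsewhere; those
    # positions are overwritten with the pre-composed latents while the background keeps
    # following the overall denoising trajectory.
    frozen_mask = frozen_mask.to(dtype=denoised_latents.dtype).clamp(0.0, 1.0)
    return composed_latents_at_step * frozen_mask + denoised_latents * (1.0 - frozen_mask)

# Toy shapes: a 64x64 latent with the left half frozen.
denoised = torch.randn(1, 4, 64, 64)
composed = torch.randn(1, 4, 64, 64)
mask = torch.zeros(1, 1, 64, 64)
mask[..., :, :32] = 1
blended = blend_frozen_latents(denoised, composed, mask)

After the frozen steps, the whole latent is denoised jointly so foreground and background can blend.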
models/modeling_utils.py
DELETED
@@ -1,874 +0,0 @@
[All 874 lines of this file were removed. It was a vendored copy of the Apache-2.0-licensed model utilities from Hugging Face diffusers: the helpers get_parameter_device, get_parameter_dtype, load_state_dict, and _load_state_dict_into_model, plus the ModelMixin base class with its gradient-checkpointing and xFormers memory-efficient-attention toggles, save_pretrained, from_pretrained, _load_pretrained_model, the device/dtype properties, num_parameters, and _convert_deprecated_attention_blocks.]
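With the vendored module gone, model loading presumably goes through diffusers' public ModelMixin.from_pretrained instead. The example below is simply the one from the removed file's own docstring; the checkpoint name is that example's, not necessarily what this Space loads.

import torch
from diffusers import UNet2DConditionModel

# Downloads (or reads from cache) the UNet weights and returns the model in eval mode.
unet = UNet2DConditionModel.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16
).to("cuda")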
models/pipelines.py
CHANGED
@@ -1,12 +1,85 @@
|
|
1 |
import torch
|
2 |
from tqdm import tqdm
|
|
|
3 |
import utils
|
4 |
-
from utils import schedule
|
5 |
from PIL import Image
|
6 |
import gc
|
7 |
import numpy as np
|
8 |
from .attention import GatedSelfAttentionDense
|
9 |
from .models import process_input_embeddings, torch_device
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
@torch.no_grad()
|
12 |
def encode(model_dict, image, generator):
|
@@ -53,6 +126,126 @@ def decode(vae, latents):
|
|
53 |
|
54 |
return images
|
55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
@torch.no_grad()
|
57 |
def generate(model_dict, latents, input_embeddings, num_inference_steps, guidance_scale = 7.5, no_set_timesteps=False, scheduler_key='dpm_scheduler'):
|
58 |
vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict[scheduler_key], model_dict.dtype
|
@@ -132,9 +325,13 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
132 |
frozen_steps=20, frozen_mask=None,
|
133 |
return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
|
134 |
offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
|
|
|
135 |
return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler', batched_condition=False, dynamic_num_inference_steps=False, fast_after_steps=None, fast_rate=2):
|
136 |
"""
|
137 |
The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
|
|
|
|
|
|
|
138 |
"""
|
139 |
vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict[scheduler_key], model_dict.dtype
|
140 |
|
@@ -161,6 +358,9 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
161 |
if fast_after_steps is not None:
|
162 |
scheduler.timesteps = schedule.get_fast_schedule(scheduler.timesteps, fast_after_steps, fast_rate)
|
163 |
|
|
|
|
|
|
|
164 |
if frozen_mask is not None:
|
165 |
frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)
|
166 |
|
@@ -171,6 +371,23 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
171 |
|
172 |
boxes, phrase_embeddings, masks, condition_len = prepare_gligen_condition(bboxes, phrases, dtype, tokenizer, text_encoder, num_images_per_prompt)
|
173 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
if return_saved_cross_attn:
|
175 |
saved_attns = []
|
176 |
|
@@ -196,6 +413,9 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
196 |
if index == num_grounding_steps:
|
197 |
gligen_enable_fuser(unet, False)
|
198 |
|
|
|
|
|
|
|
199 |
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
|
200 |
latent_model_input = torch.cat([latents] * 2)
|
201 |
|
@@ -215,7 +435,7 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
215 |
# perform guidance
|
216 |
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
217 |
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
218 |
-
|
219 |
if dynamic_num_inference_steps:
|
220 |
schedule.dynamically_adjust_inference_steps(scheduler, index, t)
|
221 |
|
@@ -225,12 +445,17 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
225 |
if frozen_mask is not None and index < frozen_steps:
|
226 |
latents = latents_all_input[index+1] * frozen_mask + latents * (1. - frozen_mask)
|
227 |
|
|
|
228 |
if save_all_latents and (fast_after_steps is None or index < fast_after_steps):
|
229 |
if offload_latents_to_cpu:
|
230 |
latents_all.append(latents.cpu())
|
231 |
else:
|
232 |
latents_all.append(latents)
|
233 |
|
|
|
|
|
|
|
|
|
234 |
# Turn off fuser for typical SD
|
235 |
gligen_enable_fuser(unet, False)
|
236 |
images = decode(vae, latents)
|
@@ -247,3 +472,128 @@ def generate_gligen(model_dict, latents, input_embeddings, num_inference_steps,
|
|
247 |
|
248 |
return tuple(ret)
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
 import torch
 from tqdm import tqdm
+from utils import guidance, schedule, boxdiff
 import utils
 from PIL import Image
 import gc
 import numpy as np
 from .attention import GatedSelfAttentionDense
 from .models import process_input_embeddings, torch_device
+import warnings
+
+# All keys: [('down', 0, 0, 0), ('down', 0, 1, 0), ('down', 1, 0, 0), ('down', 1, 1, 0), ('down', 2, 0, 0), ('down', 2, 1, 0), ('mid', 0, 0, 0), ('up', 1, 0, 0), ('up', 1, 1, 0), ('up', 1, 2, 0), ('up', 2, 0, 0), ('up', 2, 1, 0), ('up', 2, 2, 0), ('up', 3, 0, 0), ('up', 3, 1, 0), ('up', 3, 2, 0)]
+# Note that the first up block is `UpBlock2D` rather than `CrossAttnUpBlock2D` and does not have attention. The last index is always 0 in our case since we have one `BasicTransformerBlock` in each `Transformer2DModel`.
+DEFAULT_GUIDANCE_ATTN_KEYS = [("mid", 0, 0, 0), ("up", 1, 0, 0), ("up", 1, 1, 0), ("up", 1, 2, 0)]
+
+def latent_backward_guidance(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, loss_scale=30, loss_threshold=0.2, max_iter=5, max_index_step=10, cross_attention_kwargs=None, ref_ca_saved_attns=None, guidance_attn_keys=None, verbose=False, clear_cache=False, **kwargs):
+
+    iteration = 0
+
+    if index < max_index_step:
+        if isinstance(max_iter, list):
+            if len(max_iter) > index:
+                max_iter = max_iter[index]
+            else:
+                max_iter = max_iter[-1]
+
+        if verbose:
+            print(f"time index {index}, loss: {loss.item()/loss_scale:.3f} (de-scaled with scale {loss_scale:.1f}), loss threshold: {loss_threshold:.3f}")
+
+        while (loss.item() / loss_scale > loss_threshold and iteration < max_iter and index < max_index_step):
+            saved_attn = {}
+            full_cross_attention_kwargs = {
+                'save_attn_to_dict': saved_attn,
+                'save_keys': guidance_attn_keys,
+            }
+
+            if cross_attention_kwargs is not None:
+                full_cross_attention_kwargs.update(cross_attention_kwargs)
+
+            latents.requires_grad_(True)
+            latent_model_input = latents
+            latent_model_input = scheduler.scale_model_input(latent_model_input, t)
+
+            unet(latent_model_input, t, encoder_hidden_states=cond_embeddings, return_cross_attention_probs=False, cross_attention_kwargs=full_cross_attention_kwargs)
+
+            # TODO: could return the attention maps for the required blocks only and not necessarily the final output
+            # update latents with guidance
+            loss = guidance.compute_ca_lossv3(saved_attn=saved_attn, bboxes=bboxes, object_positions=object_positions, guidance_attn_keys=guidance_attn_keys, ref_ca_saved_attns=ref_ca_saved_attns, index=index, verbose=verbose, **kwargs) * loss_scale
+
+            if torch.isnan(loss):
+                print("**Loss is NaN**")
+
+            del full_cross_attention_kwargs, saved_attn
+            # call gc.collect() here may release some memory
+
+            grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents])[0]
+
+            latents.requires_grad_(False)
+
+            if hasattr(scheduler, 'sigmas'):
+                latents = latents - grad_cond * scheduler.sigmas[index] ** 2
+            elif hasattr(scheduler, 'alphas_cumprod'):
+                warnings.warn("Using guidance scaled with alphas_cumprod")
+                # Scaling with classifier guidance
+                alpha_prod_t = scheduler.alphas_cumprod[t]
+                # Classifier guidance: https://arxiv.org/pdf/2105.05233.pdf
+                # DDIM: https://arxiv.org/pdf/2010.02502.pdf
+                scale = (1 - alpha_prod_t) ** (0.5)
+                latents = latents - scale * grad_cond
+            else:
+                # NOTE: no scaling is performed
+                warnings.warn("No scaling in guidance is performed")
+                latents = latents - grad_cond
+            iteration += 1
+
+            if clear_cache:
+                utils.free_memory()
+
+            if verbose:
+                print(f"time index {index}, loss: {loss.item()/loss_scale:.3f}, loss threshold: {loss_threshold:.3f}, iteration: {iteration}")
+
+    return latents, loss
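A minimal, self-contained sketch of the update rule applied above: an energy is computed from the saved cross-attention, differentiated with respect to the latents, and the latents are stepped down the gradient with a `sigma ** 2` step size. The energy below is only a stand-in for `guidance.compute_ca_lossv3`, and the tensor sizes are illustrative.

import torch

def toy_latent_guidance(latents, sigma, loss_fn, max_iter=5, loss_threshold=0.2):
    # Repeatedly step the latents down the gradient of an energy, scaled by sigma**2,
    # mirroring the `scheduler.sigmas` branch of latent_backward_guidance.
    for _ in range(max_iter):
        latents = latents.detach().requires_grad_(True)
        loss = loss_fn(latents)
        if loss.item() <= loss_threshold:
            break
        grad = torch.autograd.grad(loss, [latents])[0]
        latents = latents.detach() - grad * sigma ** 2
    return latents.detach()

# Stand-in energy: pull the mean of a box-shaped latent region toward 1.
latents = torch.randn(1, 4, 64, 64)
box = (slice(16, 48), slice(16, 48))
energy = lambda z: (1. - z[..., box[0], box[1]].mean()).abs()
latents = toy_latent_guidance(latents, sigma=1.0, loss_fn=energy)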
@torch.no_grad()
def encode(model_dict, image, generator):
...

    return images

+def generate_semantic_guidance(model_dict, latents, input_embeddings, num_inference_steps, bboxes, phrases, object_positions, guidance_scale=7.5, semantic_guidance_kwargs=None,
+                               return_cross_attn=False, return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None, offload_guidance_cross_attn_to_cpu=False,
+                               offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True, return_box_vis=False, show_progress=True, save_all_latents=False,
+                               dynamic_num_inference_steps=False, fast_after_steps=None, fast_rate=2, use_boxdiff=False):
+    """
+    object_positions: object indices in text tokens
+    return_cross_attn: should be deprecated. Use `return_saved_cross_attn` and the new format.
+    """
+    vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+    text_embeddings, uncond_embeddings, cond_embeddings = input_embeddings
+
+    # Just in case that we have in-place ops
+    latents = latents.clone()
+
+    if save_all_latents:
+        # offload to cpu to save space
+        if offload_latents_to_cpu:
+            latents_all = [latents.cpu()]
+        else:
+            latents_all = [latents]
+
+    scheduler.set_timesteps(num_inference_steps)
+    if fast_after_steps is not None:
+        scheduler.timesteps = schedule.get_fast_schedule(scheduler.timesteps, fast_after_steps, fast_rate)
+
+    if dynamic_num_inference_steps:
+        original_num_inference_steps = scheduler.num_inference_steps
+
+    cross_attention_probs_down = []
+    cross_attention_probs_mid = []
+    cross_attention_probs_up = []
+
+    loss = torch.tensor(10000.)
+
+    # TODO: we can also save necessary tokens only to save memory.
+    # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
+    guidance_cross_attention_kwargs = {
+        'offload_cross_attn_to_cpu': offload_guidance_cross_attn_to_cpu,
+        'enable_flash_attn': False
+    }
+
+    if return_saved_cross_attn:
+        saved_attns = []
+
+    main_cross_attention_kwargs = {
+        'offload_cross_attn_to_cpu': offload_cross_attn_to_cpu,
+        'return_cond_ca_only': return_cond_ca_only,
+        'return_token_ca_only': return_token_ca_only,
+        'save_keys': saved_cross_attn_keys,
+    }
+
+    # Repeating keys leads to different weights for each key.
+    # assert len(set(semantic_guidance_kwargs['guidance_attn_keys'])) == len(semantic_guidance_kwargs['guidance_attn_keys']), f"guidance_attn_keys not unique: {semantic_guidance_kwargs['guidance_attn_keys']}"
+
+    for index, t in enumerate(tqdm(scheduler.timesteps, disable=not show_progress)):
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+
+        if bboxes:
+            if use_boxdiff:
+                latents, loss = boxdiff.latent_backward_guidance_boxdiff(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
+            else:
+                # If encountered None in `guidance_attn_keys`, please be sure to check whether `guidance_attn_keys` is added in `semantic_guidance_kwargs`. Default value has been removed.
+                latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
+
+        # predict the noise residual
+        with torch.no_grad():
+            latent_model_input = torch.cat([latents] * 2)
+            latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+
+            main_cross_attention_kwargs['save_attn_to_dict'] = {}
+
+            unet_output = unet(latent_model_input, t, encoder_hidden_states=text_embeddings, return_cross_attention_probs=return_cross_attn, cross_attention_kwargs=main_cross_attention_kwargs)
+            noise_pred = unet_output.sample
+
+            if return_cross_attn:
+                cross_attention_probs_down.append(unet_output.cross_attention_probs_down)
+                cross_attention_probs_mid.append(unet_output.cross_attention_probs_mid)
+                cross_attention_probs_up.append(unet_output.cross_attention_probs_up)
+
+            if return_saved_cross_attn:
+                saved_attns.append(main_cross_attention_kwargs['save_attn_to_dict'])
+
+            del main_cross_attention_kwargs['save_attn_to_dict']
+
+            # perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            if dynamic_num_inference_steps:
+                schedule.dynamically_adjust_inference_steps(scheduler, index, t)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+        if save_all_latents:
+            if offload_latents_to_cpu:
+                latents_all.append(latents.cpu())
+            else:
+                latents_all.append(latents)
+
+    if dynamic_num_inference_steps:
+        # Restore num_inference_steps to avoid confusion in the next generation if it is not dynamic
+        scheduler.num_inference_steps = original_num_inference_steps
+
+    images = decode(vae, latents)
+
+    ret = [latents, images]
+
+    if return_cross_attn:
+        ret.append((cross_attention_probs_down, cross_attention_probs_mid, cross_attention_probs_up))
+    if return_saved_cross_attn:
+        ret.append(saved_attns)
+    if return_box_vis:
+        pil_images = [utils.draw_box(Image.fromarray(image), bboxes, phrases) for image in images]
+        ret.append(pil_images)
+    if save_all_latents:
+        latents_all = torch.stack(latents_all, dim=0)
+        ret.append(latents_all)
+    return tuple(ret)
+
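The keyword arguments forwarded through `semantic_guidance_kwargs` end up in `latent_backward_guidance` above (extra keys such as `fg_top_p` or `ref_ca_loss_weight` pass through `**kwargs` to the loss). A plausible assembly is sketched below; the concrete values are illustrative, not the repository defaults.

semantic_guidance_kwargs = dict(
    loss_scale=30,
    loss_threshold=0.2,
    max_iter=5,
    max_index_step=10,
    guidance_attn_keys=DEFAULT_GUIDANCE_ATTN_KEYS,
    verbose=True,
)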
 @torch.no_grad()
 def generate(model_dict, latents, input_embeddings, num_inference_steps, guidance_scale=7.5, no_set_timesteps=False, scheduler_key='dpm_scheduler'):
     vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict[scheduler_key], model_dict.dtype
...
                     frozen_steps=20, frozen_mask=None,
                     return_saved_cross_attn=False, saved_cross_attn_keys=None, return_cond_ca_only=False, return_token_ca_only=None,
                     offload_cross_attn_to_cpu=False, offload_latents_to_cpu=True,
+                    semantic_guidance=False, semantic_guidance_bboxes=None, semantic_guidance_object_positions=None, semantic_guidance_kwargs=None,
                     return_box_vis=False, show_progress=True, save_all_latents=False, scheduler_key='dpm_scheduler', batched_condition=False, dynamic_num_inference_steps=False, fast_after_steps=None, fast_rate=2):
     """
     The `bboxes` should be a list, rather than a list of lists (one box per phrase, we can have multiple duplicated phrases).
+    batched:
+        Enabled: bboxes and phrases should be a list (batch dimension) of items (specify the bboxes/phrases of each image in the batch).
+        Disabled: bboxes and phrases should be a list of bboxes and phrases specifying the bboxes/phrases of one image (no batch dimension).
     """
     vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict[scheduler_key], model_dict.dtype

...
     if fast_after_steps is not None:
         scheduler.timesteps = schedule.get_fast_schedule(scheduler.timesteps, fast_after_steps, fast_rate)

+    if dynamic_num_inference_steps:
+        original_num_inference_steps = scheduler.num_inference_steps
+
     if frozen_mask is not None:
         frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)

...

     boxes, phrase_embeddings, masks, condition_len = prepare_gligen_condition(bboxes, phrases, dtype, tokenizer, text_encoder, num_images_per_prompt)

+    if semantic_guidance_bboxes and semantic_guidance:
+        loss = torch.tensor(10000.)
+        # TODO: we can also save necessary tokens only to save memory.
+        # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
+        guidance_cross_attention_kwargs = {
+            'offload_cross_attn_to_cpu': False,
+            'enable_flash_attn': False,
+            'gligen': {
+                'boxes': boxes[:condition_len // 2],
+                'positive_embeddings': phrase_embeddings[:condition_len // 2],
+                'masks': masks[:condition_len // 2],
+                'fuser_attn_kwargs': {
+                    'enable_flash_attn': False,
+                }
+            }
+        }
+
     if return_saved_cross_attn:
         saved_attns = []

...
         if index == num_grounding_steps:
             gligen_enable_fuser(unet, False)

+        if semantic_guidance_bboxes and semantic_guidance:
+            with torch.enable_grad():
+                latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, semantic_guidance_bboxes, semantic_guidance_object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
         # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
         latent_model_input = torch.cat([latents] * 2)

...
         # perform guidance
         noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
         noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
         if dynamic_num_inference_steps:
             schedule.dynamically_adjust_inference_steps(scheduler, index, t)

...
         if frozen_mask is not None and index < frozen_steps:
             latents = latents_all_input[index+1] * frozen_mask + latents * (1. - frozen_mask)

+        # Do not save the latents in the fast steps
         if save_all_latents and (fast_after_steps is None or index < fast_after_steps):
             if offload_latents_to_cpu:
                 latents_all.append(latents.cpu())
             else:
                 latents_all.append(latents)

+    if dynamic_num_inference_steps:
+        # Restore num_inference_steps to avoid confusion in the next generation if it is not dynamic
+        scheduler.num_inference_steps = original_num_inference_steps
+
     # Turn off fuser for typical SD
     gligen_enable_fuser(unet, False)
     images = decode(vae, latents)
...

     return tuple(ret)

+
+def get_inverse_timesteps(inverse_scheduler, num_inference_steps, strength):
+    # get the original timestep using init_timestep
+    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+    t_start = max(num_inference_steps - init_timestep, 0)
+
+    # safety for t_start overflow to prevent an empty timesteps slice
+    if t_start == 0:
+        return inverse_scheduler.timesteps, num_inference_steps
+    timesteps = inverse_scheduler.timesteps[:-t_start]
+
+    return timesteps, num_inference_steps - t_start
+
+@torch.no_grad()
+def invert(model_dict, latents, input_embeddings, num_inference_steps, guidance_scale=7.5):
+    """
+    latents: encoded from the image, should not have noise (t = 0)
+
+    returns inverted_latents for all time steps
+    """
+    vae, tokenizer, text_encoder, unet, scheduler, inverse_scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.inverse_scheduler, model_dict.dtype
+    text_embeddings, uncond_embeddings, cond_embeddings = input_embeddings
+
+    inverse_scheduler.set_timesteps(num_inference_steps, device=latents.device)
+    # We need to invert all steps because we need them to generate the background.
+    timesteps, num_inference_steps = get_inverse_timesteps(inverse_scheduler, num_inference_steps, strength=1.0)
+
+    inverted_latents = [latents.cpu()]
+    for t in tqdm(timesteps[:-1]):
+        # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+        if guidance_scale > 0.:
+            latent_model_input = torch.cat([latents] * 2)
+
+            latent_model_input = inverse_scheduler.scale_model_input(latent_model_input, timestep=t)
+
+            # predict the noise residual
+            with torch.no_grad():
+                noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+        else:
+            latent_model_input = latents
+
+            latent_model_input = inverse_scheduler.scale_model_input(latent_model_input, timestep=t)
+
+            # predict the noise residual
+            with torch.no_grad():
+                noise_pred_uncond = unet(latent_model_input, t, encoder_hidden_states=uncond_embeddings).sample
+
+            # perform guidance
+            noise_pred = noise_pred_uncond
+
+        # compute the previous noisy sample x_t -> x_t-1
+        latents = inverse_scheduler.step(noise_pred, t, latents).prev_sample
+
+        inverted_latents.append(latents.cpu())
+
+    assert len(inverted_latents) == len(timesteps)
+    # timestep is the first dimension
+    inverted_latents = torch.stack(list(reversed(inverted_latents)), dim=0)
+
+    return inverted_latents
+
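A quick numeric check of the timestep-trimming logic in `get_inverse_timesteps` above (the values are illustrative):

num_inference_steps, strength = 50, 0.6
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 30
t_start = max(num_inference_steps - init_timestep, 0)                          # 20
# With strength=1.0 (as used in invert), t_start == 0 and the full schedule is kept;
# otherwise the last t_start entries of inverse_scheduler.timesteps are dropped,
# so only the first 30 of the 50 inversion steps would be run.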
+def generate_partial_frozen(model_dict, latents_all, frozen_mask, input_embeddings, num_inference_steps, frozen_steps, guidance_scale=7.5, bboxes=None, phrases=None, object_positions=None, semantic_guidance_kwargs=None, offload_guidance_cross_attn_to_cpu=False, use_boxdiff=False):
+    vae, tokenizer, text_encoder, unet, scheduler, dtype = model_dict.vae, model_dict.tokenizer, model_dict.text_encoder, model_dict.unet, model_dict.scheduler, model_dict.dtype
+    text_embeddings, uncond_embeddings, cond_embeddings = input_embeddings
+
+    scheduler.set_timesteps(num_inference_steps)
+    frozen_mask = frozen_mask.to(dtype=dtype).clamp(0., 1.)
+
+    latents = latents_all[0]
+
+    if bboxes:
+        # With semantic guidance
+        loss = torch.tensor(10000.)
+
+        # offload_guidance_cross_attn_to_cpu does not save too much since we only store attention map for each timestep.
+        guidance_cross_attention_kwargs = {
+            'offload_cross_attn_to_cpu': offload_guidance_cross_attn_to_cpu,
+            # Getting invalid argument on backward, probably due to insufficient shared memory
+            'enable_flash_attn': False
+        }
+
+    for index, t in enumerate(tqdm(scheduler.timesteps)):
+        if bboxes:
+            # With semantic guidance, `guidance_attn_keys` should be in `semantic_guidance_kwargs`
+            if use_boxdiff:
+                latents, loss = boxdiff.latent_backward_guidance_boxdiff(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
+            else:
+                latents, loss = latent_backward_guidance(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, cross_attention_kwargs=guidance_cross_attention_kwargs, **semantic_guidance_kwargs)
+
+        with torch.no_grad():
+            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
+            latent_model_input = torch.cat([latents] * 2)
+
+            latent_model_input = scheduler.scale_model_input(latent_model_input, timestep=t)
+
+            # predict the noise residual
+            noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+            # perform guidance
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+
+            if index < frozen_steps:
+                latents = latents_all[index+1] * frozen_mask + latents * (1. - frozen_mask)
+
+    # scale and decode the image latents with vae
+    scaled_latents = 1 / 0.18215 * latents
+    with torch.no_grad():
+        image = vae.decode(scaled_latents).sample
+
+    image = (image / 2 + 0.5).clamp(0, 1)
+    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
+    images = (image * 255).round().astype("uint8")
+
+    ret = [latents, images]
+
+    return tuple(ret)
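A minimal sketch of the frozen-latent compositing used above: where the mask is 1 the saved trajectory is kept, elsewhere the freshly denoised latents are used. Shapes and the mask region are illustrative.

import torch

frozen_mask = torch.zeros(1, 1, 64, 64)
frozen_mask[..., 16:48, 16:48] = 1.0        # region kept from the saved trajectory
saved_latents = torch.randn(1, 4, 64, 64)   # stands in for latents_all[index + 1]
new_latents = torch.randn(1, 4, 64, 64)     # stands in for scheduler.step(...).prev_sample
latents = saved_latents * frozen_mask + new_latents * (1. - frozen_mask)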
models/sam.py CHANGED
@@ -164,8 +164,10 @@ def sam_refine_attn(sam_input_image, token_attn_np, model_dict, height, width, H
     return mask_selected, conf_score_selected
 
 def sam_refine_box(sam_input_image, box, *args, **kwargs):
-
-
+    # One image with one box
+    sam_input_images, boxes = [sam_input_image], [[box]]
+    mask_selected_batched_list, conf_score_selected_batched_list = sam_refine_boxes(sam_input_images, boxes, *args, **kwargs)
+    return mask_selected_batched_list[0][0], conf_score_selected_batched_list[0][0]
 
 def sam_refine_boxes(sam_input_images, boxes, model_dict, height, width, H, W, discourage_mask_below_confidence, discourage_mask_below_coarse_iou, verbose):
     # (w, h)
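A hypothetical single-box call to the new wrapper, assuming a SAM `model_dict` loaded by the surrounding code; the box format, image size (height/width), latent size (H/W), and thresholds below are illustrative assumptions rather than repository defaults.

import numpy as np

sam_input_image = np.zeros((512, 512, 3), dtype=np.uint8)   # placeholder image
box = (0.1, 0.2, 0.5, 0.6)                                  # placeholder box (assumed normalized coordinates)
mask, conf = sam_refine_box(
    sam_input_image, box,
    model_dict=model_dict, height=512, width=512, H=64, W=64,
    discourage_mask_below_confidence=0.85, discourage_mask_below_coarse_iou=0.25,
    verbose=False,
)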
utils/attn.py
ADDED
@@ -0,0 +1,140 @@
# visualization-related functions are in vis
import numbers
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import utils

def get_token_attnv2(token_id, saved_attns, attn_key, visualize_step_start=10, input_ca_has_condition_only=False, return_np=False):
    """
    saved_attns: a list of saved_attn (list is across timesteps)

    moves to cpu by default
    """
    saved_attns = saved_attns[visualize_step_start:]

    saved_attns = [saved_attn[attn_key].cpu() for saved_attn in saved_attns]

    attn = torch.stack(saved_attns, dim=0).mean(dim=0)

    # print("attn shape", attn.shape)

    # attn: (batch, head, spatial, text)

    if not input_ca_has_condition_only:
        assert attn.shape[0] == 2, f"Expect to have 2 items (uncond and cond), but found {attn.shape[0]} items"
        attn = attn[1]
    else:
        assert attn.shape[0] == 1, f"Expect to have 1 item (cond only), but found {attn.shape[0]} items"
        attn = attn[0]
    attn = attn.mean(dim=0)[:, token_id]
    H = W = int(math.sqrt(attn.shape[0]))
    attn = attn.reshape((H, W))

    if return_np:
        return attn.numpy()

    return attn

def shift_saved_attns_item(saved_attns_item, offset, guidance_attn_keys, horizontal_shift_only=False):
    """
    `horizontal_shift_only`: only shift horizontally. If you use `offset` from `compose_latents_with_alignment` with `horizontal_shift_only=True`, the `offset` already has y_offset = 0 and this option is not needed.
    """
    x_offset, y_offset = offset
    if horizontal_shift_only:
        y_offset = 0.

    new_saved_attns_item = {}
    for k in guidance_attn_keys:
        attn_map = saved_attns_item[k]

        attn_size = attn_map.shape[-2]
        attn_h = attn_w = int(math.sqrt(attn_size))
        # Example dimensions: [batch_size, num_heads, 8, 8, num_tokens]
        attn_map = attn_map.unflatten(2, (attn_h, attn_w))
        attn_map = utils.shift_tensor(
            attn_map, x_offset, y_offset,
            offset_normalized=True, ignore_last_dim=True
        )
        attn_map = attn_map.flatten(2, 3)

        new_saved_attns_item[k] = attn_map

    return new_saved_attns_item

def shift_saved_attns(saved_attns, offset, guidance_attn_keys, **kwargs):
    # Iterate over timesteps
    shifted_saved_attns = [shift_saved_attns_item(saved_attns_item, offset, guidance_attn_keys, **kwargs) for saved_attns_item in saved_attns]

    return shifted_saved_attns

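A usage sketch for `get_token_attnv2`. Real `saved_attns` come from running the pipeline with `return_saved_cross_attn=True`; the fabricated tensors and key below are only illustrative stand-ins.

import torch

attn_key = ("up", 1, 1, 0)   # one of DEFAULT_GUIDANCE_ATTN_KEYS
# (batch=2 for uncond/cond, heads, spatial, text tokens), one dict per timestep
saved_attns = [{attn_key: torch.rand(2, 8, 256, 77)} for _ in range(50)]

token_attn = get_token_attnv2(token_id=5, saved_attns=saved_attns, attn_key=attn_key,
                              visualize_step_start=10, return_np=True)
# token_attn is a (16, 16) map averaged over the kept timesteps and attention heads.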
class GaussianSmoothing(nn.Module):
    """
    Apply gaussian smoothing on a
    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.
    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
            have this number of channels as well.
        kernel_size (int, sequence): Size of the gaussian kernel.
        sigma (float, sequence): Standard deviation of the gaussian kernel.
        dim (int, optional): The number of dimensions of the data.
            Default value is 2 (spatial).

    Credit: https://discuss.pytorch.org/t/is-there-anyway-to-do-gaussian-filtering-for-an-image-2d-3d-in-pytorch/12351/10
    """

    def __init__(self, channels, kernel_size, sigma, dim=2):
        super(GaussianSmoothing, self).__init__()
        if isinstance(kernel_size, numbers.Number):
            kernel_size = [kernel_size] * dim
        if isinstance(sigma, numbers.Number):
            sigma = [sigma] * dim

        # The gaussian kernel is the product of the
        # gaussian function of each dimension.
        kernel = 1
        meshgrids = torch.meshgrid(
            [
                torch.arange(size, dtype=torch.float32)
                for size in kernel_size
            ]
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= 1 / (std * math.sqrt(2 * math.pi)) * \
                torch.exp(-((mgrid - mean) / (2 * std)) ** 2)

        # Make sure sum of values in gaussian kernel equals 1.
        kernel = kernel / torch.sum(kernel)

        # Reshape to depthwise convolutional weight
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer('weight', kernel)
        self.groups = channels

        if dim == 1:
            self.conv = F.conv1d
        elif dim == 2:
            self.conv = F.conv2d
        elif dim == 3:
            self.conv = F.conv3d
        else:
            raise RuntimeError(
                'Only 1, 2 and 3 dimensions are supported. Received {}.'.format(
                    dim)
            )

    def forward(self, input):
        """
        Apply gaussian filter to input.
        Arguments:
            input (torch.Tensor): Input to apply gaussian filter on.
        Returns:
            filtered (torch.Tensor): Filtered output.
        """
        return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups)
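A usage sketch mirroring how the BoxDiff loss below smooths a 2D attention map before taking top-k values; the tensor size is illustrative and CUDA is not required here.

import torch
import torch.nn.functional as F

smoothing = GaussianSmoothing(channels=1, kernel_size=3, sigma=0.5, dim=2)
attn_map = torch.rand(16, 16)                                  # (H, W) attention for one token
padded = F.pad(attn_map[None, None], (1, 1, 1, 1), mode='reflect')
smoothed = smoothing(padded).squeeze(0).squeeze(0)             # back to (H, W)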
utils/boxdiff.py
ADDED
@@ -0,0 +1,259 @@
"""
This is a reimplementation of the BoxDiff baseline for reference and comparison. It is not used in the Web UI and is not enabled by default, since the current attention guidance implementation (in `guidance`), which uses attention maps from multiple levels and attention transfer, seems to be more robust and coherent.

Credit: https://github.com/showlab/BoxDiff/blob/master/pipeline/sd_pipeline_boxdiff.py
"""

import torch
import torch.nn.functional as F
import math
import warnings
import gc
from collections.abc import Iterable
import utils
from . import guidance
from .attn import GaussianSmoothing

from typing import Any, Callable, Dict, List, Optional, Union, Tuple


def _compute_max_attention_per_index(attention_maps: torch.Tensor,
                                     object_positions: List[List[int]],
                                     smooth_attentions: bool = False,
                                     sigma: float = 0.5,
                                     kernel_size: int = 3,
                                     normalize_eot: bool = False,
                                     bboxes: List[List[int]] = None,
                                     P: float = 0.2,
                                     L: int = 1,
                                     ) -> List[torch.Tensor]:
    """ Computes the maximum attention value for each of the tokens we wish to alter. """
    last_idx = -1
    assert not normalize_eot, "normalize_eot is unimplemented"

    attention_for_text = attention_maps[:, :, 1:last_idx]
    attention_for_text *= 100
    attention_for_text = F.softmax(attention_for_text, dim=-1)

    # Extract the maximum values
    max_indices_list_fg = []
    max_indices_list_bg = []
    dist_x = []
    dist_y = []

    for obj_idx, text_positions_per_obj in enumerate(object_positions):
        for text_position_per_obj in text_positions_per_obj:
            # Shift indices since we removed the first token
            image = attention_for_text[:, :, text_position_per_obj - 1]
            H, W = image.shape

            obj_mask = torch.zeros_like(image)
            corner_mask_x = torch.zeros(
                (W,), device=obj_mask.device, dtype=obj_mask.dtype)
            corner_mask_y = torch.zeros(
                (H,), device=obj_mask.device, dtype=obj_mask.dtype)

            obj_boxes = bboxes[obj_idx]

            # We support two level (one box per phrase) and three level (multiple boxes per phrase)
            if not isinstance(obj_boxes[0], Iterable):
                obj_boxes = [obj_boxes]

            for obj_box in obj_boxes:
                x_min, y_min, x_max, y_max = utils.scale_proportion(
                    obj_box, H=H, W=W)
                obj_mask[y_min: y_max, x_min: x_max] = 1

                corner_mask_x[max(x_min - L, 0): min(x_min + L + 1, W)] = 1.
                corner_mask_x[max(x_max - L, 0): min(x_max + L + 1, W)] = 1.
                corner_mask_y[max(y_min - L, 0): min(y_min + L + 1, H)] = 1.
                corner_mask_y[max(y_max - L, 0): min(y_max + L + 1, H)] = 1.

            bg_mask = 1 - obj_mask

            if smooth_attentions:
                smoothing = GaussianSmoothing(
                    channels=1, kernel_size=kernel_size, sigma=sigma, dim=2).cuda()
                input = F.pad(image.unsqueeze(0).unsqueeze(0),
                              (1, 1, 1, 1), mode='reflect')
                image = smoothing(input).squeeze(0).squeeze(0)

            # Inner-Box constraint
            k = (obj_mask.sum() * P).long()
            max_indices_list_fg.append(
                (image * obj_mask).reshape(-1).topk(k)[0].mean())

            # Outer-Box constraint
            k = (bg_mask.sum() * P).long()
            max_indices_list_bg.append(
                (image * bg_mask).reshape(-1).topk(k)[0].mean())

            # Corner Constraint
            gt_proj_x = torch.max(obj_mask, dim=0).values
            gt_proj_y = torch.max(obj_mask, dim=1).values

            # create gt according to the number L
            dist_x.append((F.l1_loss(image.max(dim=0)[
                          0], gt_proj_x, reduction='none') * corner_mask_x).mean())
            dist_y.append((F.l1_loss(image.max(dim=1)[
                          0], gt_proj_y, reduction='none') * corner_mask_y).mean())

    return max_indices_list_fg, max_indices_list_bg, dist_x, dist_y


def _compute_loss(max_attention_per_index_fg: List[torch.Tensor], max_attention_per_index_bg: List[torch.Tensor],
                  dist_x: List[torch.Tensor], dist_y: List[torch.Tensor], return_losses: bool = False) -> torch.Tensor:
    """ Computes the attend-and-excite loss using the maximum attention value for each token. """
    losses_fg = [max(0, 1. - curr_max)
                 for curr_max in max_attention_per_index_fg]
    losses_bg = [max(0, curr_max) for curr_max in max_attention_per_index_bg]
    loss = sum(losses_fg) + sum(losses_bg) + sum(dist_x) + sum(dist_y)

    # print(f"{losses_fg}, {losses_bg}, {dist_x}, {dist_y}, {loss}")

    if return_losses:
        return max(losses_fg), losses_fg
    else:
        return max(losses_fg), loss

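A toy illustration of the Inner-Box and Outer-Box constraints above on a random attention map; the map size, box, and P are illustrative.

import torch

attn = torch.rand(16, 16)
obj_mask = torch.zeros(16, 16)
obj_mask[4:12, 4:12] = 1
P = 0.2

k_fg = int((obj_mask.sum() * P).item())
k_bg = int(((1 - obj_mask).sum() * P).item())
fg_top = (attn * obj_mask).reshape(-1).topk(k_fg).values.mean()        # want this close to 1
bg_top = (attn * (1 - obj_mask)).reshape(-1).topk(k_bg).values.mean()  # want this close to 0
loss = (1. - fg_top).clamp(min=0) + bg_top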
def compute_ca_loss_boxdiff(saved_attn, bboxes, object_positions, guidance_attn_keys, ref_ca_saved_attns=None, ref_ca_last_token_only=True, ref_ca_word_token_only=False, word_token_indices=None, index=None, ref_ca_loss_weight=1.0, verbose=False, **kwargs):
    """
    v3 is equivalent to v2 but with new dictionary format for attention maps.
    The `saved_attn` is supposed to be passed to `save_attn_to_dict` in `cross_attention_kwargs` prior to computing this loss.
    `AttnProcessor` will put attention maps into the `save_attn_to_dict`.

    `index` is the timestep.
    `ref_ca_word_token_only`: This has precedence over `ref_ca_last_token_only` (i.e., if both are enabled, we take the token from word rather than the last token).
    `ref_ca_last_token_only`: `ref_ca_saved_attn` comes from the attention map of the last token of the phrase in single object generation, so we apply it only to the last token of the phrase in overall generation if this is set to True. If set to False, `ref_ca_saved_attn` will be applied to all the text tokens.
    """
    loss = torch.tensor(0).float().cuda()
    object_number = len(bboxes)
    if object_number == 0:
        return loss

    attn_map_list = []

    for attn_key in guidance_attn_keys:
        # We only have 1 cross attention for mid.
        attn_map_integrated = saved_attn[attn_key]
        if not attn_map_integrated.is_cuda:
            attn_map_integrated = attn_map_integrated.cuda()
        # Example dimension: [20, 64, 77]
        attn_map = attn_map_integrated.squeeze(dim=0)
        attn_map_list.append(attn_map)
    # This averages both across layers and across attention heads
    attn_map = torch.cat(attn_map_list, dim=0).mean(dim=0)
    loss = add_ca_loss_per_attn_map_to_loss_boxdiff(
        loss, attn_map, object_number, bboxes, object_positions, verbose=verbose, **kwargs)

    if ref_ca_saved_attns is not None:
        warnings.warn('Attention reference loss is enabled in boxdiff mode. The original boxdiff does not have attention reference loss.')

        ref_loss = torch.tensor(0).float().cuda()
        ref_loss = guidance.add_ref_ca_loss_per_attn_map_to_lossv2(
            ref_loss, saved_attn=saved_attn, object_number=object_number, bboxes=bboxes, object_positions=object_positions, guidance_attn_keys=guidance_attn_keys,
            ref_ca_saved_attns=ref_ca_saved_attns, ref_ca_last_token_only=ref_ca_last_token_only, ref_ca_word_token_only=ref_ca_word_token_only, word_token_indices=word_token_indices, verbose=verbose, index=index, loss_weight=ref_ca_loss_weight
        )
        print(f"loss {loss.item():.3f}, reference attention loss (weighted) {ref_loss.item():.3f}")
        loss += ref_loss

    return loss


def add_ca_loss_per_attn_map_to_loss_boxdiff(original_loss, attention_maps, object_number, bboxes, object_positions, P=0.2, L=1, smooth_attentions=True, sigma=0.5, kernel_size=3, normalize_eot=False, verbose=False):
    # NOTE: normalize_eot is enabled in SD v2.1 in boxdiff
    i, j = attention_maps.shape
    H = W = int(math.sqrt(i))

    attention_maps = attention_maps.view(H, W, j)
    # attention_maps is aggregated cross attn map across layers and steps
    # attention_maps shape: [H, W, 77]
    max_attention_per_index_fg, max_attention_per_index_bg, dist_x, dist_y = _compute_max_attention_per_index(
        attention_maps=attention_maps,
        object_positions=object_positions,
        smooth_attentions=smooth_attentions,
        sigma=sigma,
        kernel_size=kernel_size,
        normalize_eot=normalize_eot,
        bboxes=bboxes,
        P=P,
        L=L
    )

    _, loss = _compute_loss(max_attention_per_index_fg,
                            max_attention_per_index_bg, dist_x, dist_y)

    return original_loss + loss


def latent_backward_guidance_boxdiff(scheduler, unet, cond_embeddings, index, bboxes, object_positions, t, latents, loss, amp_loss_scale=10, latent_scale=20, scale_range=(1., 0.5), max_index_step=25, cross_attention_kwargs=None, ref_ca_saved_attns=None, guidance_attn_keys=None, verbose=False, **kwargs):
    """
    amp_loss_scale: this scales the loss but will de-scale before applying for latents. This is to prevent overflow/underflow with amp, not to adjust the update step size.
    latent_scale: this scales the step size for update (scale_factor in boxdiff).
    """

    if index < max_index_step:
        saved_attn = {}
        full_cross_attention_kwargs = {
            'save_attn_to_dict': saved_attn,
            'save_keys': guidance_attn_keys,
        }

        if cross_attention_kwargs is not None:
            full_cross_attention_kwargs.update(cross_attention_kwargs)

        latents.requires_grad_(True)
        latent_model_input = latents
        latent_model_input = scheduler.scale_model_input(latent_model_input, t)

        unet(latent_model_input, t, encoder_hidden_states=cond_embeddings,
             return_cross_attention_probs=False, cross_attention_kwargs=full_cross_attention_kwargs)

        # TODO: could return the attention maps for the required blocks only and not necessarily the final output
        # update latents with guidance
        loss = compute_ca_loss_boxdiff(saved_attn=saved_attn, bboxes=bboxes, object_positions=object_positions, guidance_attn_keys=guidance_attn_keys,
                                       ref_ca_saved_attns=ref_ca_saved_attns, index=index, verbose=verbose, **kwargs) * amp_loss_scale

        if torch.isnan(loss):
            print("**Loss is NaN**")

        del full_cross_attention_kwargs, saved_attn
        # call gc.collect() here may release some memory

        grad_cond = torch.autograd.grad(
            loss.requires_grad_(True), [latents])[0]

        latents.requires_grad_(False)

        if True:
            warnings.warn("Using guidance scaled with sqrt scale")
            # According to boxdiff's implementation: https://github.com/Sierkinhane/BoxDiff/blob/16ffb677a9128128e04553a0200870a526731be0/pipeline/sd_pipeline_boxdiff.py#L616
            scale = (scale_range[0] + (scale_range[1] - scale_range[0])
                     * index / (len(scheduler.timesteps) - 1)) ** (0.5)
            latents = latents - latent_scale * scale / amp_loss_scale * grad_cond
        elif hasattr(scheduler, 'sigmas'):
            warnings.warn("Using guidance scaled with sigmas")
            scale = scheduler.sigmas[index] ** 2
            latents = latents - grad_cond * scale
        elif hasattr(scheduler, 'alphas_cumprod'):
            warnings.warn("Using guidance scaled with alphas_cumprod")
            # Scaling with classifier guidance
            alpha_prod_t = scheduler.alphas_cumprod[t]
            # Classifier guidance: https://arxiv.org/pdf/2105.05233.pdf
            # DDIM: https://arxiv.org/pdf/2010.02502.pdf
            scale = (1 - alpha_prod_t) ** (0.5)
            latents = latents - latent_scale * scale / amp_loss_scale * grad_cond
        else:
            warnings.warn("No scaling in guidance is performed")
            scale = 1
            latents = latents - grad_cond

        gc.collect()
        torch.cuda.empty_cache()

        if verbose:
            print(
                f"time index {index}, loss: {loss.item() / amp_loss_scale:.3f} (de-scaled with scale {amp_loss_scale:.1f}), latent grad scale: {scale:.3f}")

    return latents, loss
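The step size used above interpolates within `scale_range` across the schedule and takes a square root, following BoxDiff. A quick check with illustrative numbers:

scale_range, latent_scale, amp_loss_scale = (1., 0.5), 20, 10
num_steps, index = 50, 10
scale = (scale_range[0] + (scale_range[1] - scale_range[0]) * index / (num_steps - 1)) ** 0.5
step_size = latent_scale * scale / amp_loss_scale   # about 1.9 early on, shrinking over the schedule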
utils/guidance.py
ADDED
@@ -0,0 +1,358 @@
import torch
import torch.nn.functional as F
import math
from collections.abc import Iterable
import warnings

import utils

# A list mapping: prompt index to str (prompt in a list of token str)
def get_token_map(tokenizer, prompt, verbose=False, padding="do_not_pad"):
    fg_prompt_tokens = tokenizer([prompt], padding=padding, max_length=77, return_tensors="np")
    input_ids = fg_prompt_tokens['input_ids'][0]

    # index_to_last_with = np.max(np.where(input_ids == 593))
    # index_to_last_eot = np.max(np.where(input_ids == 49407))

    token_map = []
    for ind, item in enumerate(input_ids.tolist()):

        token = tokenizer._convert_id_to_token(item)
        if verbose:
            print(f"{ind}, {token} ({item})")

        token_map.append(token)

        # If we don't pad, we don't need to break.
        # if item == tokenizer.eos_token_id:
        #     break

    return token_map

def get_phrase_indices(tokenizer, prompt, phrases, verbose=False, words=None, include_eos=False, token_map=None, return_word_token_indices=False, add_suffix_if_not_found=False):
    for obj in phrases:
        # Suffix the prompt with object name for attention guidance if object is not in the prompt, using "|" to separate the prompt and the suffix
        if obj not in prompt:
            prompt += "| " + obj

    if token_map is None:
        # We allow using a pre-computed token map.
        token_map = get_token_map(tokenizer, prompt=prompt, verbose=verbose, padding="do_not_pad")
    token_map_str = " ".join(token_map)

    object_positions = []
    word_token_indices = []
    for obj_ind, obj in enumerate(phrases):
        phrase_token_map = get_token_map(tokenizer, prompt=obj, verbose=verbose, padding="do_not_pad")
        # Remove <bos> and <eos> in substr
        phrase_token_map = phrase_token_map[1:-1]
        phrase_token_map_len = len(phrase_token_map)
        phrase_token_map_str = " ".join(phrase_token_map)

        if verbose:
            print("Full str:", token_map_str, "Substr:", phrase_token_map_str, "Phrase:", phrases)

        # Count the number of token before substr
        # The substring comes with a trailing space that needs to be removed by minus one in the index.
        obj_first_index = len(token_map_str[:token_map_str.index(phrase_token_map_str)-1].split(" "))

        obj_position = list(range(obj_first_index, obj_first_index + phrase_token_map_len))
        if include_eos:
            obj_position.append(token_map.index(tokenizer.eos_token))
        object_positions.append(obj_position)

        if return_word_token_indices:
            # Picking the last token in the specification
            if words is None:
                so_token_index = object_positions[0][-1]
                # Picking the noun or perform pooling on attention with the tokens may be better
                print(f"Picking the last token \"{token_map[so_token_index]}\" ({so_token_index}) as attention token for extracting attention for SAM, which might not be the right one")
            else:
                word = words[obj_ind]
                word_token_map = get_token_map(tokenizer, prompt=word, verbose=verbose, padding="do_not_pad")
                # Get the index of the last token of word (the occurrence in phrase) in the prompt. Note that we skip the <eos> token through indexing with -2.
                so_token_index = obj_first_index + phrase_token_map.index(word_token_map[-2])

            if verbose:
                print("so_token_index:", so_token_index)

            word_token_indices.append(so_token_index)

    if return_word_token_indices:
        if add_suffix_if_not_found:
            return object_positions, word_token_indices, prompt
        return object_positions, word_token_indices

    if add_suffix_if_not_found:
        return object_positions, prompt

    return object_positions

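A hypothetical use of `get_phrase_indices` with the Stable Diffusion CLIP tokenizer; the prompt, phrases, and checkpoint name are illustrative assumptions.

from transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
prompt = "A realistic photo of a gray cat and an orange dog on the grass"
phrases = ["a gray cat", "an orange dog"]
object_positions = get_phrase_indices(tokenizer, prompt, phrases)
# object_positions[i] lists the token indices of phrases[i] inside the tokenized prompt,
# which is what the attention-guidance losses below index into.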
def add_ca_loss_per_attn_map_to_loss(loss, attn_map, object_number, bboxes, object_positions, use_ratio_based_loss=True, fg_top_p=0.2, bg_top_p=0.2, fg_weight=1.0, bg_weight=1.0, verbose=False):
    """
    fg_top_p, bg_top_p, fg_weight, and bg_weight are only used with max-based loss
    """

    # Uncomment to debug:
    # print(fg_top_p, bg_top_p, fg_weight, bg_weight)

    # b is the number of heads, not batch
    b, i, j = attn_map.shape
    H = W = int(math.sqrt(i))
    for obj_idx in range(object_number):
        obj_loss = 0
        mask = torch.zeros(size=(H, W), device="cuda")
        obj_boxes = bboxes[obj_idx]

        # We support two level (one box per phrase) and three level (multiple boxes per phrase)
        if not isinstance(obj_boxes[0], Iterable):
            obj_boxes = [obj_boxes]

        for obj_box in obj_boxes:
            # x_min, y_min, x_max, y_max = int(obj_box[0] * W), int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
            x_min, y_min, x_max, y_max = utils.scale_proportion(obj_box, H=H, W=W)
            mask[y_min: y_max, x_min: x_max] = 1

        for obj_position in object_positions[obj_idx]:
            # Could potentially optimize to compute this for loop in batch.
            # Could crop the ref cross attention before saving to save memory.

            ca_map_obj = attn_map[:, :, obj_position].reshape(b, H, W)

            if use_ratio_based_loss:
                warnings.warn("Using ratio-based loss, which is deprecated. Max-based loss is recommended. The scale may be different.")
                # Original loss function (ratio-based loss function)

                # Enforces the attention to be within the mask only. Does not enforce within-mask distribution.
                activation_value = (ca_map_obj * mask).reshape(b, -1).sum(dim=-1)/ca_map_obj.reshape(b, -1).sum(dim=-1)
                obj_loss += torch.mean((1 - activation_value) ** 2)
                # if verbose:
                #     print(f"enforce attn to be within the mask loss: {torch.mean((1 - activation_value) ** 2).item():.2f}")
            else:
                # Max-based loss function

                # shape: (b, H * W)
                ca_map_obj = attn_map[:, :, obj_position]  # .reshape(b, H, W)
                k_fg = (mask.sum() * fg_top_p).long().clamp_(min=1)
                k_bg = ((1 - mask).sum() * bg_top_p).long().clamp_(min=1)

                mask_1d = mask.view(1, -1)

                # Take the topk over spatial dimension, and then take the sum over heads dim
                # The mean is over k_fg and k_bg dimension, so we don't need to sum and divide on our own.
                obj_loss += (1 - (ca_map_obj * mask_1d).topk(k=k_fg).values.mean(dim=1)).sum(dim=0) * fg_weight
                obj_loss += ((ca_map_obj * (1 - mask_1d)).topk(k=k_bg).values.mean(dim=1)).sum(dim=0) * bg_weight

        loss += obj_loss / len(object_positions[obj_idx])

    return loss

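A standalone illustration of the two penalties above (ratio-based versus max-based) on one head and one token; values and sizes are illustrative and the snippet runs on CPU.

import torch

b, H, W = 1, 8, 8
attn = torch.rand(b, H * W, 1)             # (heads, spatial, one token)
mask = torch.zeros(H, W)
mask[2:6, 2:6] = 1
mask_1d = mask.view(1, -1)

# Ratio-based: fraction of total attention that falls inside the box.
ca = attn[:, :, 0]
ratio_loss = torch.mean((1 - (ca * mask_1d).sum(-1) / ca.sum(-1)) ** 2)

# Max-based: reward high top-k attention inside the box, penalize it outside.
k_fg = int(mask.sum() * 0.2)
k_bg = int((1 - mask).sum() * 0.2)
max_loss = (1 - (ca * mask_1d).topk(k_fg).values.mean(-1)).sum() \
         + ((ca * (1 - mask_1d)).topk(k_bg).values.mean(-1)).sum()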
def add_ref_ca_loss_per_attn_map_to_lossv2(loss, saved_attn, object_number, bboxes, object_positions, guidance_attn_keys, ref_ca_saved_attns, ref_ca_last_token_only, ref_ca_word_token_only, word_token_indices, index, loss_weight, eps=1e-5, verbose=False):
    """
    This adds the ca loss with ref. Note that this should be used with ca loss without ref since it only enforces the mse of the normalized ca between ref and target.

    `ref_ca_saved_attn` should have the same structure as bboxes and object_positions (until the inner content, which should be similar to saved_attn).
    """

    if loss_weight == 0.:
        # Skip computing the reference loss if the loss weight is 0.
        return loss

    for obj_idx in range(object_number):
        obj_loss = 0

        obj_boxes = bboxes[obj_idx]
        obj_ref_ca_saved_attns = ref_ca_saved_attns[obj_idx]

        # We support two level (one box per phrase) and three level (multiple boxes per phrase)
        if not isinstance(obj_boxes[0], Iterable):
            obj_boxes = [obj_boxes]
            obj_ref_ca_saved_attns = [obj_ref_ca_saved_attns]

        assert len(obj_boxes) == len(obj_ref_ca_saved_attns), f"obj_boxes: {len(obj_boxes)}, obj_ref_ca_saved_attns: {len(obj_ref_ca_saved_attns)}"

        for obj_box, obj_ref_ca_saved_attn in zip(obj_boxes, obj_ref_ca_saved_attns):
            # obj_ref_ca_map_items has all timesteps.
            # Format: (timestep (index), attn_key, batch, heads, 2d dim, num text tokens (selected 1))

            # Different from ca_loss without ref, which has one loss for all boxes for a phrase (a set of object positions), we have one loss per box.

            # obj_ref_ca_saved_attn_items: select the timestep
            obj_ref_ca_saved_attn = obj_ref_ca_saved_attn[index]

            for attn_key in guidance_attn_keys:
                attn_map = saved_attn[attn_key]
                if not attn_map.is_cuda:
                    attn_map = attn_map.cuda()
                attn_map = attn_map.squeeze(dim=0)

                obj_ref_ca_map = obj_ref_ca_saved_attn[attn_key]
                if not obj_ref_ca_map.is_cuda:
                    obj_ref_ca_map = obj_ref_ca_map.cuda()
                # obj_ref_ca_map: (batch, heads, 2d dim, num text token)
                # `squeeze` on `obj_ref_ca_map` is combined with the subsequent indexing

                # b is the number of heads, not batch
                b, i, j = attn_map.shape
                H = W = int(math.sqrt(i))
                # `obj_ref_ca_map` only has one text token (the 0 at the last dimension)

                assert obj_ref_ca_map.ndim == 4, f"{obj_ref_ca_map.shape}"
                obj_ref_ca_map = obj_ref_ca_map[0, :, :, 0]

                # Same mask for all heads
                obj_mask = torch.zeros(size=(H, W), device="cuda")
                # x_min, y_min, x_max, y_max = int(obj_box[0] * W), int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
                x_min, y_min, x_max, y_max = utils.scale_proportion(obj_box, H=H, W=W)
                obj_mask[y_min: y_max, x_min: x_max] = 1

                # keep 1d mask
                obj_mask = obj_mask.reshape(1, -1)

                # Optimize the loss over the last phrase token only (assuming the indices in `object_positions[obj_idx]` is sorted)
                if ref_ca_word_token_only:
                    object_positions_to_iterate = [word_token_indices[obj_idx]]
                elif ref_ca_last_token_only:
                    object_positions_to_iterate = [object_positions[obj_idx][-1]]
                else:
                    print(f"Applying attention transfer from one attention to all attention maps in object positions {object_positions[obj_idx]}, which is likely to be incorrect")
                    object_positions_to_iterate = object_positions[obj_idx]
                for obj_position in object_positions_to_iterate:
                    ca_map_obj = attn_map[:, :, obj_position]

                    ca_map_obj_masked = ca_map_obj * obj_mask

                    # Add eps because the sum can be very small, causing NaN
                    ca_map_obj_masked_normalized = ca_map_obj_masked / (ca_map_obj_masked.sum(dim=-1, keepdim=True) + eps)
                    obj_ref_ca_map_masked = obj_ref_ca_map * obj_mask
                    obj_ref_ca_map_masked_normalized = obj_ref_ca_map_masked / (obj_ref_ca_map_masked.sum(dim=-1, keepdim=True) + eps)

                    # We found dividing by object mask size makes the loss too small. Since the normalized masked attn has mean value inversely proportional to the mask size, summing the values up spatially gives a relatively standard scale to add to other losses.
                    activation_value = (torch.abs(ca_map_obj_masked_normalized - obj_ref_ca_map_masked_normalized)).sum(dim=-1)

                    obj_loss += torch.mean(activation_value, dim=0)

        # The normalization for len(obj_ref_ca_map_items) is at the outside of this function.
        # Note that we assume we have at least one box for each object
        loss += loss_weight * obj_loss / (len(obj_boxes) * len(object_positions_to_iterate))

        if verbose:
            print(f"reference cross-attention obj_loss: unweighted {obj_loss.item() / (len(obj_boxes) * len(object_positions[obj_idx])):.3f}, weighted {loss_weight * obj_loss.item() / (len(obj_boxes) * len(object_positions[obj_idx])):.3f}")

    return loss

def compute_ca_lossv3(saved_attn, bboxes, object_positions, guidance_attn_keys, ref_ca_saved_attns=None, ref_ca_last_token_only=True, ref_ca_word_token_only=False, word_token_indices=None, index=None, ref_ca_loss_weight=1.0, verbose=False, **kwargs):
    """
    v3 is equivalent to v2 but with new dictionary format for attention maps.
    The `saved_attn` is supposed to be passed to `save_attn_to_dict` in `cross_attention_kwargs` prior to computing this loss.
    `AttnProcessor` will put attention maps into the `save_attn_to_dict`.

    `index` is the timestep.
    `ref_ca_word_token_only`: This has precedence over `ref_ca_last_token_only` (i.e., if both are enabled, we take the token from word rather than the last token).
    `ref_ca_last_token_only`: `ref_ca_saved_attn` comes from the attention map of the last token of the phrase in single object generation, so we apply it only to the last token of the phrase in overall generation if this is set to True. If set to False, `ref_ca_saved_attn` will be applied to all the text tokens.
    """
    loss = torch.tensor(0).float().cuda()
    object_number = len(bboxes)
    if object_number == 0:
        return loss

    for attn_key in guidance_attn_keys:
        # We only have 1 cross attention for mid.
        attn_map_integrated = saved_attn[attn_key]
        if not attn_map_integrated.is_cuda:
            attn_map_integrated = attn_map_integrated.cuda()
        # Example dimension: [20, 64, 77]
        attn_map = attn_map_integrated.squeeze(dim=0)
        loss = add_ca_loss_per_attn_map_to_loss(loss, attn_map, object_number, bboxes, object_positions, verbose=verbose, **kwargs)

    num_attn = len(guidance_attn_keys)

    if num_attn > 0:
        loss = loss / (object_number * num_attn)

    if ref_ca_saved_attns is not None:
        ref_loss = torch.tensor(0).float().cuda()
        ref_loss = add_ref_ca_loss_per_attn_map_to_lossv2(
            ref_loss, saved_attn=saved_attn, object_number=object_number, bboxes=bboxes, object_positions=object_positions, guidance_attn_keys=guidance_attn_keys,
            ref_ca_saved_attns=ref_ca_saved_attns, ref_ca_last_token_only=ref_ca_last_token_only, ref_ca_word_token_only=ref_ca_word_token_only, word_token_indices=word_token_indices, verbose=verbose, index=index, loss_weight=ref_ca_loss_weight
        )

        num_attn = len(guidance_attn_keys)

        if verbose:
            print(f"loss {loss.item():.3f}, reference attention loss (weighted) {ref_loss.item() / (object_number * num_attn):.3f}")

        loss += ref_loss / (object_number * num_attn)

    return loss

# For compatibility
def add_ref_ca_loss_per_attn_map_to_loss(loss, attn_maps, object_number, bboxes, object_positions, ref_ca_maps, stage_id, index, verbose=False):
    """
    This adds the ca loss with ref. Note that this should be used with ca loss without ref since it only enforces the mse of the normalized ca between ref and target.

    ref_ca_maps should have the same structure as bboxes and object_positions.
    """
    # attn_map_items is all cond ca maps for current down/mid/up for the overall generation.
    attn_map_items = attn_maps[stage_id]

    for obj_idx in range(object_number):
        obj_loss = 0

        obj_boxes = bboxes[obj_idx]
        obj_ref_ca_maps = ref_ca_maps[obj_idx]

        # We support two level (one box per phrase) and three level (multiple boxes per phrase)
        if not isinstance(obj_boxes[0], Iterable):
            obj_boxes = [obj_boxes]
            obj_ref_ca_maps = [obj_ref_ca_maps]

        assert len(obj_boxes) == len(obj_ref_ca_maps), f"obj_boxes: {len(obj_boxes)}, obj_ref_ca_maps: {len(obj_ref_ca_maps)}"

        for obj_box, obj_ref_ca_map_items in zip(obj_boxes, obj_ref_ca_maps):
            # obj_ref_ca_map_items format: (stage, timestep (index), block, batch, heads, 2d dim, num text tokens (selected 1))
            # Different from ca_loss without ref, which has one loss for all boxes for a phrase (a set of object positions), we have one loss per box.

            # print(len(obj_ref_ca_map_items), obj_ref_ca_map_items[stage_id].shape)
            # Mid example: 1 torch.Size([50, 1, 1, 8, 64, 1])
            # Up example: 3 torch.Size([50, 3, 1, 8, 256, 1])

            # obj_ref_ca_map_items is all cond ca maps for current down/mid/up for the single object generation.
            obj_ref_ca_map_items = obj_ref_ca_map_items[stage_id][index]

            for attn_map, obj_ref_ca_map in zip(attn_map_items, obj_ref_ca_map_items):
                attn_map = attn_map.squeeze(dim=0)
                # b is the number of heads, not batch
                b, i, j = attn_map.shape
                H = W = int(math.sqrt(i))
                # obj_ref_ca_map only has one text token (the 0 at the last dimension)

                assert obj_ref_ca_map.ndim == 4, f"{obj_ref_ca_map.ndim}"
                obj_ref_ca_map = obj_ref_ca_map[0, :, :, 0]

                # Same mask for all heads
                obj_mask = torch.zeros(size=(H, W), device="cuda")
|
335 |
+
x_min, y_min, x_max, y_max = int(obj_box[0] * W), \
|
336 |
+
int(obj_box[1] * H), int(obj_box[2] * W), int(obj_box[3] * H)
|
337 |
+
obj_mask[y_min: y_max, x_min: x_max] = 1
|
338 |
+
|
339 |
+
# keep 1d mask
|
340 |
+
obj_mask = obj_mask.reshape(1, -1)
|
341 |
+
|
342 |
+
for obj_position in object_positions[obj_idx]:
|
343 |
+
ca_map_obj = attn_map[:, :, obj_position]
|
344 |
+
|
345 |
+
ca_map_obj_masked = ca_map_obj * obj_mask
|
346 |
+
obj_ref_ca_map_masked = obj_ref_ca_map * obj_mask
|
347 |
+
# We found dividing by object mask size makes the loss too small. Since the normalized masked attn has mean value inversely proportional to the mask size, summing the values up spatially gives a relatively standard scale to add to other losses.
|
348 |
+
activation_value = (torch.abs(ca_map_obj_masked / ca_map_obj_masked.sum(dim=-1, keepdim=True) - obj_ref_ca_map_masked / obj_ref_ca_map_masked.sum(dim=-1, keepdim=True))).sum(dim=-1) # / obj_mask.sum()
|
349 |
+
|
350 |
+
obj_loss += torch.mean(activation_value, dim=0)
|
351 |
+
|
352 |
+
# The normalization for len(obj_ref_ca_map_items) is at the outside of this function.
|
353 |
+
loss += obj_loss / (len(obj_boxes) * len(object_positions[obj_idx]))
|
354 |
+
|
355 |
+
if verbose:
|
356 |
+
print(f"reference cross-attention obj_loss: {obj_loss.item() / (len(obj_boxes) * len(object_positions[obj_idx])):.3f}")
|
357 |
+
|
358 |
+
return loss
|
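Aside: the attention-transfer term added above reduces to an L1 difference between two box-masked, re-normalized cross-attention maps (overall generation vs. single-object reference), averaged over heads. A minimal, self-contained sketch of that computation on random tensors follows; the function name, tensor shapes, and eps value are illustrative assumptions, not code from this commit.

import torch

def masked_attn_transfer_loss(attn_map, ref_attn_map, obj_mask, eps=1e-5):
    # attn_map, ref_attn_map: (heads, spatial) cross-attention for one text token
    # obj_mask: (1, spatial) binary mask of the object's bounding box
    attn_masked = attn_map * obj_mask
    ref_masked = ref_attn_map * obj_mask
    # Re-normalize inside the box; eps avoids dividing by a near-zero sum
    attn_norm = attn_masked / (attn_masked.sum(dim=-1, keepdim=True) + eps)
    ref_norm = ref_masked / (ref_masked.sum(dim=-1, keepdim=True) + eps)
    # Sum of absolute differences over space, averaged over attention heads
    return torch.abs(attn_norm - ref_norm).sum(dim=-1).mean()

heads, h, w = 8, 16, 16
attn = torch.rand(heads, h * w)
ref = torch.rand(heads, h * w)
mask = torch.zeros(h, w)
mask[4:12, 4:12] = 1  # a hypothetical bounding box in the attention resolution
print(masked_attn_transfer_loss(attn, ref, mask.reshape(1, -1)))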
utils/latents.py
CHANGED
@@ -44,9 +44,10 @@ def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inferenc

     # Other than t=T (idx=0), we only have masked latents. This is to prevent accidentally loading from non-masked part. Use same mask as the one used to compose the latents.
     if use_fast_schedule:
-        # If we use fast schedule, we only
+        # If we use fast schedule, we only compose the frozen steps because the later steps do not match.
         composed_latents = torch.zeros((fast_after_steps + 1, *latents_bg.shape), dtype=dtype)
     else:
+        # Otherwise we compose all steps so that we don't need to compose again if we change the frozen steps.
         composed_latents = torch.zeros((num_inference_steps + 1, *latents_bg.shape), dtype=dtype)
     composed_latents[0] = latents_bg

@@ -73,7 +74,7 @@ def compose_latents(model_dict, latents_all_list, mask_tensor_list, num_inferenc
         latents_all, mask_tensor = latents_all_list[mask_idx], mask_tensor_list[mask_idx]
         foreground_indices = foreground_indices * (~mask_tensor) + (mask_idx + 1) * mask_tensor
         mask_tensor_expanded = mask_tensor[None, None, None, ...].to(dtype)
-        composed_latents = composed_latents * (1. - mask_tensor_expanded) + latents_all * mask_tensor_expanded
+        composed_latents = composed_latents * (1. - mask_tensor_expanded) + latents_all[:fast_after_steps + 1] * mask_tensor_expanded

     composed_latents, foreground_indices = composed_latents.to(torch_device), foreground_indices.to(torch_device)
     return composed_latents, foreground_indices
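Aside: the fix above blends each object's latents into the composed latents only for the frozen steps, using a bounding-box mask broadcast over the step, batch, and channel dimensions. A small sketch of that masked blend with made-up shapes (not code from this commit):

import torch

num_inference_steps, fast_after_steps = 6, 3
latent_shape = (1, 4, 8, 8)  # (batch, channels, h, w), illustrative only

# With the fast schedule, only the frozen steps are kept in the composed tensor
composed = torch.zeros(fast_after_steps + 1, *latent_shape)
per_object = torch.randn(num_inference_steps + 1, *latent_shape)

mask = torch.zeros(8, 8, dtype=torch.bool)
mask[2:6, 2:6] = True  # hypothetical object region in latent space
mask_expanded = mask[None, None, None, ...].float()  # broadcasts over (step, batch, channel)

# Mirror the slicing in the diff: blend only the first fast_after_steps + 1 steps
composed = composed * (1. - mask_expanded) + per_object[:fast_after_steps + 1] * mask_expanded
print(composed.shape)  # torch.Size([4, 1, 4, 8, 8])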
utils/parse.py
CHANGED
@@ -1,33 +1,39 @@
 import ast
-import os
-import json
 from matplotlib.patches import Polygon
 from matplotlib.collections import PatchCollection
 import matplotlib.pyplot as plt
 import numpy as np
-import
+import warnings
 import inflect

 p = inflect.engine()

 img_dir = "imgs"
+objects_text = "Objects: "
 bg_prompt_text = "Background prompt: "
+bg_prompt_text_no_trailing_space = bg_prompt_text.rstrip()
+neg_prompt_text = "Negative prompt: "
+neg_prompt_text_no_trailing_space = neg_prompt_text.rstrip()
+
 # h, w
 box_scale = (512, 512)
 size = box_scale
 size_h, size_w = size
 print(f"Using box scale: {box_scale}")

+
 def parse_input(text=None, no_input=False):
+    warnings.warn("Parsing input without negative prompt is deprecated.")
+
     if not text:
         if no_input:
             return

         text = input("Enter the response: ")
-    if
-        text = text.split(
+    if objects_text in text:
+        text = text.split(objects_text)[1]

-    text_split = text.split(
+    text_split = text.split(bg_prompt_text_no_trailing_space)
     if len(text_split) == 2:
         gen_boxes, bg_prompt = text_split
     elif len(text_split) == 1:
@@ -38,8 +44,8 @@ def parse_input(text=None, no_input=False):
         while not bg_prompt:
             # Ignore the empty lines in the response
             bg_prompt = input("Enter the background prompt: ").strip()
-        if
-            bg_prompt = bg_prompt.split(
+        if bg_prompt_text_no_trailing_space in bg_prompt:
+            bg_prompt = bg_prompt.split(bg_prompt_text_no_trailing_space)[1]
     else:
         raise ValueError(f"text: {text}")
     try:
@@ -54,7 +60,70 @@ def parse_input(text=None, no_input=False):

     return gen_boxes, bg_prompt

+def parse_input_with_negative(text=None, no_input=False):
+    # no_input: should not request interactive input
+
+    if not text:
+        if no_input:
+            return
+
+        text = input("Enter the response: ")
+    if objects_text in text:
+        text = text.split(objects_text)[1]
+
+    text_split = text.split(bg_prompt_text_no_trailing_space)
+    if len(text_split) == 2:
+        gen_boxes, text_rem = text_split
+    elif len(text_split) == 1:
+        if no_input:
+            return
+        gen_boxes = text
+        text_rem = ""
+        while not text_rem:
+            # Ignore the empty lines in the response
+            text_rem = input("Enter the background prompt: ").strip()
+        if bg_prompt_text_no_trailing_space in text_rem:
+            text_rem = text_rem.split(bg_prompt_text_no_trailing_space)[1]
+    else:
+        raise ValueError(f"text: {text}")
+
+    text_split = text_rem.split(neg_prompt_text_no_trailing_space)
+
+    if len(text_split) == 2:
+        bg_prompt, neg_prompt = text_split
+    elif len(text_split) == 1:
+        bg_prompt = text_rem
+        # Negative prompt is optional: if it's not provided, we default to empty string
+        neg_prompt = ""
+        if not no_input:
+            # Ignore the empty lines in the response
+            neg_prompt = input("Enter the negative prompt: ").strip()
+            if neg_prompt_text_no_trailing_space in neg_prompt:
+                neg_prompt = neg_prompt.split(neg_prompt_text_no_trailing_space)[1]
+    else:
+        raise ValueError(f"text: {text}")
+
+    try:
+        gen_boxes = ast.literal_eval(gen_boxes)
+    except SyntaxError as e:
+        # Sometimes the response is in plain text
+        if "No objects" in gen_boxes or gen_boxes.strip() == "":
+            gen_boxes = []
+        else:
+            raise e
+    bg_prompt = bg_prompt.strip()
+    neg_prompt = neg_prompt.strip()
+
+    # LLM may return "None" to mean no negative prompt provided.
+    if neg_prompt == "None":
+        neg_prompt = ""
+
+    return gen_boxes, bg_prompt, neg_prompt
+
 def filter_boxes(gen_boxes, scale_boxes=True, ignore_background=True, max_scale=3):
+    if gen_boxes is None:
+        return []
+
     if len(gen_boxes) == 0:
         return []

@@ -62,9 +131,13 @@ def filter_boxes(gen_boxes, scale_boxes=True, ignore_background=True, max_scale=
     gen_boxes_new = []
     for gen_box in gen_boxes:
         if isinstance(gen_box, dict):
+            if not gen_box['bounding_box']:
+                continue
             name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box['name'], gen_box['bounding_box']
             box_dict_format = True
         else:
+            if not gen_box[1]:
+                continue
             name, [bbox_x, bbox_y, bbox_w, bbox_h] = gen_box
         if bbox_w <= 0 or bbox_h <= 0:
             # Empty boxes
@@ -73,6 +146,12 @@ def filter_boxes(gen_boxes, scale_boxes=True, ignore_background=True, max_scale=
         if (bbox_w >= size[1] and bbox_h >= size[0]) or bbox_x > size[1] or bbox_y > size[0]:
             # Ignore the background boxes
             continue
+
+        if bbox_x < 0 or bbox_y < 0 or bbox_x + bbox_w > size[1] or bbox_y + bbox_h > size[0]:
+            # Out of bounds boxes exist: we need to scale and shift all the boxes
+            print(f"**Some boxes are out of bounds: {gen_box}, scaling all the boxes to fit**")
+            scale_boxes = True
+
         gen_boxes_new.append(gen_box)

     gen_boxes = gen_boxes_new
@@ -99,9 +178,11 @@ def filter_boxes(gen_boxes, scale_boxes=True, ignore_background=True, max_scale=

     # Used if scale_boxes is True
     shift = -bbox_left_x_min
-
+    # Make sure the boxes fit horizontally and vertically
+    scale_w = size_w / (bbox_right_x_max - bbox_left_x_min)
+    scale_h = size_h / (bbox_bottom_y_max - bbox_top_y_min)

-    scale = min(
+    scale = min(scale_w, scale_h, max_scale)

     for gen_box in gen_boxes:
         if box_dict_format:
@@ -165,7 +246,7 @@ def draw_boxes(anns):
     ax.add_collection(p)


-def show_boxes(gen_boxes, bg_prompt=None, ind=None, show=False):
+def show_boxes(gen_boxes, bg_prompt=None, neg_prompt=None, ind=None, show=False):
     if len(gen_boxes) == 0:
         return

@@ -183,7 +264,7 @@ def show_boxes(gen_boxes, bg_prompt=None, ind=None, show=False):

     if bg_prompt is not None:
         ax = plt.gca()
-        ax.text(0, 0, bg_prompt, style='italic',
+        ax.text(0, 0, bg_prompt + f"(Neg: {neg_prompt})" if neg_prompt else bg_prompt, style='italic',
                 bbox={'facecolor': 'white', 'alpha': 0.7, 'pad': 5})

     c = (np.zeros((1, 3)))
@@ -200,12 +281,6 @@ def show_boxes(gen_boxes, bg_prompt=None, ind=None, show=False):
     draw_boxes(anns)
     if show:
         plt.show()
-    else:
-        print("Saved to", f"{img_dir}/boxes.png", f"ind: {ind}")
-        if ind is not None:
-            plt.savefig(f"{img_dir}/boxes_{ind}.png")
-        plt.savefig(f"{img_dir}/boxes.png")
-

 def show_masks(masks):
     masks_to_show = np.zeros((*size, 3), dtype=np.float32)
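Aside: to make the new parsing path concrete, the sketch below feeds parse_input_with_negative a hypothetical response in the "Objects:" / "Background prompt:" / "Negative prompt:" layout that the added constants define. The scene and box values are invented for illustration; per the code above, a literal "None" negative prompt should come back as an empty string.

from utils.parse import parse_input_with_negative

# A made-up LLM response following the expected layout
response = """Objects: [('a gray cat', [60, 260, 180, 160]), ('a potted plant', [320, 220, 120, 200])]
Background prompt: A realistic photo of a living room
Negative prompt: None"""

gen_boxes, bg_prompt, neg_prompt = parse_input_with_negative(response, no_input=True)
print(gen_boxes)   # [('a gray cat', [60, 260, 180, 160]), ('a potted plant', [320, 220, 120, 200])]
print(bg_prompt)   # 'A realistic photo of a living room'
print(neg_prompt)  # '' (the literal "None" is mapped to an empty string)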
utils/utils.py
CHANGED
@@ -1,7 +1,6 @@
 import torch
 from PIL import ImageDraw
 import numpy as np
-import os
 import gc

 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
utils/vis.py
ADDED
@@ -0,0 +1,153 @@
+import matplotlib.pyplot as plt
+import math
+import utils
+from . import parse
+
+save_ind = 0
+
+def visualize(image, title, colorbar=False, show_plot=True, **kwargs):
+    plt.title(title)
+    plt.imshow(image, **kwargs)
+    if colorbar:
+        plt.colorbar()
+    if show_plot:
+        plt.show()
+
+def visualize_arrays(image_title_pairs, colorbar_index=-1, show_plot=True, figsize=None, **kwargs):
+    if figsize is not None:
+        plt.figure(figsize=figsize)
+    num_subplots = len(image_title_pairs)
+    for idx, image_title_pair in enumerate(image_title_pairs):
+        plt.subplot(1, num_subplots, idx+1)
+        if isinstance(image_title_pair, (list, tuple)):
+            image, title = image_title_pair
+        else:
+            image, title = image_title_pair, None
+
+        if title is not None:
+            plt.title(title)
+
+        plt.imshow(image, **kwargs)
+        if idx == colorbar_index:
+            plt.colorbar()
+
+    if show_plot:
+        plt.show()
+
+def visualize_masked_latents(latents_all, masked_latents, timestep_T=False, timestep_0=True):
+    if timestep_T:
+        # from T to 0
+        latent_idx = 0
+
+        plt.subplot(1, 2, 1)
+        plt.title("latents_all (t=T)")
+        plt.imshow((latents_all[latent_idx, 0, :3].cpu().permute(1,2,0).numpy().astype(float) / 1.5).clip(0., 1.), cmap="gray")
+
+        plt.subplot(1, 2, 2)
+        plt.title("mask latents (t=T)")
+        plt.imshow((masked_latents[latent_idx, 0, :3].cpu().permute(1,2,0).numpy().astype(float) / 1.5).clip(0., 1.), cmap="gray")
+
+        plt.show()
+
+    if timestep_0:
+        latent_idx = -1
+        plt.subplot(1, 2, 1)
+        plt.title("latents_all (t=0)")
+        plt.imshow((latents_all[latent_idx, 0, :3].cpu().permute(1,2,0).numpy().astype(float) / 1.5).clip(0., 1.), cmap="gray")
+
+        plt.subplot(1, 2, 2)
+        plt.title("mask latents (t=0)")
+        plt.imshow((masked_latents[latent_idx, 0, :3].cpu().permute(1,2,0).numpy().astype(float) / 1.5).clip(0., 1.), cmap="gray")
+
+        plt.show()
+
+# This function has not been adapted to new `saved_attn`.
+def visualize_attn(token_map, cross_attention_probs_tensors, stage_id, block_id, visualize_step_start=10, input_ca_has_condition_only=False):
+    """
+    Visualize cross attention: `stage_id`th downsampling block, mean over all timesteps starting from step start, `block_id`th Transformer block, second item (conditioned), mean over heads, show each token
+    cross_attention_probs_tensors:
+        One of `cross_attention_probs_down_tensors`, `cross_attention_probs_mid_tensors`, and `cross_attention_probs_up_tensors`
+    stage_id: index of downsampling/mid/upsaming block
+    block_id: index of the transformer block
+    """
+
+    plt.figure(figsize=(20, 8))
+
+    for token_id in range(len(token_map)):
+        token = token_map[token_id]
+        plt.subplot(1, len(token_map), token_id + 1)
+        plt.title(token)
+        attn = cross_attention_probs_tensors[stage_id][visualize_step_start:].mean(dim=0)[block_id]
+
+        if not input_ca_has_condition_only:
+            assert attn.shape[0] == 2, f"Expect to have 2 items (uncond and cond), but found {attn.shape[0]} items"
+            attn = attn[1]
+        else:
+            assert attn.shape[0] == 1, f"Expect to have 1 item (cond only), but found {attn.shape[0]} items"
+            attn = attn[0]
+
+        attn = attn.mean(dim=0)[:, token_id]
+        H = W = int(math.sqrt(attn.shape[0]))
+        attn = attn.reshape((H, W))
+        plt.imshow(attn.cpu().numpy())
+
+    plt.show()
+
+# This function has not been adapted to new `saved_attn`.
+def visualize_across_timesteps(token_id, cross_attention_probs_tensors, stage_id, block_id, visualize_step_start=10, input_ca_has_condition_only=False):
+    """
+    Visualize cross attention for one token, across timesteps: `stage_id`th downsampling block, mean over all timesteps starting from step start, `block_id`th Transformer block, second item (conditioned), mean over heads, show each token
+    cross_attention_probs_tensors:
+        One of `cross_attention_probs_down_tensors`, `cross_attention_probs_mid_tensors`, and `cross_attention_probs_up_tensors`
+    stage_id: index of downsampling/mid/upsaming block
+    block_id: index of the transformer block
+
+    `visualize_step_start` is not used. We visualize all timesteps.
+    """
+    plt.figure(figsize=(50, 8))
+
+    attn_stage = cross_attention_probs_tensors[stage_id]
+    num_inference_steps = attn_stage.shape[0]
+
+    for t in range(num_inference_steps):
+        plt.subplot(1, num_inference_steps, t + 1)
+        plt.title(f"t: {t}")
+
+        attn = attn_stage[t][block_id]
+
+        if not input_ca_has_condition_only:
+            assert attn.shape[0] == 2, f"Expect to have 2 items (uncond and cond), but found {attn.shape[0]} items"
+            attn = attn[1]
+        else:
+            assert attn.shape[0] == 1, f"Expect to have 1 item (cond only), but found {attn.shape[0]} items"
+            attn = attn[0]
+
+        attn = attn.mean(dim=0)[:, token_id]
+        H = W = int(math.sqrt(attn.shape[0]))
+        attn = attn.reshape((H, W))
+        plt.imshow(attn.cpu().numpy())
+        plt.axis("off")
+    plt.tight_layout()
+
+    plt.show()
+
+def visualize_bboxes(bboxes, H, W):
+    num_boxes = len(bboxes)
+    for ind, bbox in enumerate(bboxes):
+        plt.subplot(1, num_boxes, ind + 1)
+        fg_mask = utils.proportion_to_mask(bbox, H, W)
+        plt.title(f"transformed bbox ({ind})")
+        plt.imshow(fg_mask.cpu().numpy())
+    plt.show()
+
+def display(image, save_prefix="", ind=None):
+    global save_ind
+    if save_prefix != "":
+        save_prefix = save_prefix + "_"
+    ind = f"{ind}_" if ind is not None else ""
+    path = f"{parse.img_dir}/{save_prefix}{ind}{save_ind}.png"
+
+    print(f"Saved to {path}")
+
+    image.save(path)
+    save_ind = save_ind + 1
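Aside: a possible usage sketch for the new visualization helpers. The arrays, prefix, and image below are invented, the import assumes the Space's repo root is on the Python path, and display assumes the imgs directory (parse.img_dir) already exists.

import numpy as np
from PIL import Image
from utils import vis

# Show two hypothetical single-channel maps side by side, with a colorbar on the second one
attn_a = np.random.rand(16, 16)
attn_b = np.random.rand(16, 16)
vis.visualize_arrays([(attn_a, "attn (before)"), (attn_b, "attn (after)")], colorbar_index=1)

# Save a generated image under parse.img_dir with an auto-incremented index
demo_image = Image.fromarray((np.random.rand(64, 64, 3) * 255).astype(np.uint8))
vis.display(demo_image, save_prefix="demo")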