dbaranchuk committed on
Commit
bf363c0
1 Parent(s): 21d434e

Main update

Files changed (7)
  1. README.md +1 -1
  2. app.py +364 -91
  3. generation.py +621 -0
  4. inversion.py +104 -0
  5. p2p.py +454 -0
  6. requirements.txt +3 -1
  7. seq_aligner.py +181 -0
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: ICD Image Editing
+ title: Demo App Editing
  emoji: 🖼
  colorFrom: purple
  colorTo: red
app.py CHANGED
@@ -1,52 +1,141 @@
 
  import gradio as gr
  import numpy as np
  import random
- from diffusers import DiffusionPipeline
  import torch

  device = "cuda" if torch.cuda.is_available() else "cpu"

- if torch.cuda.is_available():
- torch.cuda.max_memory_allocated(device=device)
- pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
- pipe.enable_xformers_memory_efficient_attention()
- pipe = pipe.to(device)
- else:
- pipe = DiffusionPipeline.from_pretrained("stabilityai/sdxl-turbo", use_safetensors=True)
- pipe = pipe.to(device)

  MAX_SEED = np.iinfo(np.int32).max
  MAX_IMAGE_SIZE = 1024

- def infer(prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps):

- if randomize_seed:
- seed = random.randint(0, MAX_SEED)
-
- generator = torch.Generator().manual_seed(seed)
-
- image = pipe(
- prompt = prompt,
- negative_prompt = negative_prompt,
- guidance_scale = guidance_scale,
- num_inference_steps = num_inference_steps,
- width = width,
- height = height,
- generator = generator
- ).images[0]
-
- return image

- examples = [
- "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
- "An astronaut riding a green horse",
- "A delicious ceviche cheesecake slice",
- ]

  css="""
  #col-container {
  margin: 0 auto;
- max-width: 520px;
  }
  """
 
@@ -58,89 +147,273 @@ else:
  with gr.Blocks(css=css) as demo:

  with gr.Column(elem_id="col-container"):
- gr.Markdown(f"""
- # Text-to-Image Gradio Template
- Currently running on {power_device}.
- """)
-
  with gr.Row():

- prompt = gr.Text(
- label="Prompt",
- show_label=False,
  max_lines=1,
  placeholder="Enter your prompt",
- container=False,
  )
-
- run_button = gr.Button("Run", scale=0)
-
- result = gr.Image(label="Result", show_label=False)

- with gr.Accordion("Advanced Settings", open=False):
-
- negative_prompt = gr.Text(
- label="Negative prompt",
  max_lines=1,
- placeholder="Enter a negative prompt",
- visible=False,
  )

- seed = gr.Slider(
- label="Seed",
- minimum=0,
- maximum=MAX_SEED,
- step=1,
- value=0,
- )

- randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

  with gr.Row():

- width = gr.Slider(
- label="Width",
- minimum=256,
- maximum=MAX_IMAGE_SIZE,
- step=32,
- value=512,
  )
-
- height = gr.Slider(
- label="Height",
- minimum=256,
- maximum=MAX_IMAGE_SIZE,
- step=32,
- value=512,
  )
-
  with gr.Row():

- guidance_scale = gr.Slider(
- label="Guidance scale",
  minimum=0.0,
- maximum=10.0,
  step=0.1,
- value=0.0,
  )
-
- num_inference_steps = gr.Slider(
- label="Number of inference steps",
- minimum=1,
- maximum=12,
- step=1,
- value=2,
  )
-
- gr.Examples(
- examples = examples,
- inputs = [prompt]
- )

  run_button.click(
  fn = infer,
- inputs = [prompt, negative_prompt, seed, randomize_seed, width, height, guidance_scale, num_inference_steps],
  outputs = [result]
  )

- demo.queue().launch()
 
1
+ import spaces
2
  import gradio as gr
3
  import numpy as np
4
  import random
 
5
  import torch
6
+ from diffusers import DDPMScheduler, StableDiffusionPipeline, DDIMScheduler, UNet2DConditionModel
7
+ import p2p, generation, inversion
8
 
9
+ model_id = 'runwayml/stable-diffusion-v1-5'
10
+ dtype=torch.float16
11
  device = "cuda" if torch.cuda.is_available() else "cpu"
12
 
13
+ # Reverse
14
+ # -----------------------------
15
+ pipe_reverse = StableDiffusionPipeline.from_pretrained(model_id,
16
+ scheduler=DDIMScheduler.from_pretrained(model_id,
17
+ subfolder="scheduler"),
18
+ ).to(device=device, dtype=dtype)
19
+ unet = UNet2DConditionModel.from_pretrained("dbaranchuk/sd15-cfg-distill-unet").to(device)
20
+ pipe_reverse.unet = unet
21
+ pipe_reverse.load_lora_weights("dbaranchuk/icd-lora-sd15",
22
+ weight_name='reverse-259-519-779-999.safetensors')
23
+ pipe_reverse.fuse_lora()
24
+ pipe_reverse.to(device)
25
+ # -----------------------------
26
+
27
+ # Forward
28
+ # -----------------------------
29
+ pipe_forward = StableDiffusionPipeline.from_pretrained(model_id,
30
+ scheduler=DDIMScheduler.from_pretrained(model_id,
31
+ subfolder="scheduler"),
32
+ ).to(device=device, dtype=dtype)
33
+ unet = UNet2DConditionModel.from_pretrained("dbaranchuk/sd15-cfg-distill-unet").to(device)
34
+ pipe_forward.unet = unet
35
+ pipe_forward.load_lora_weights("dbaranchuk/icd-lora-sd15",
36
+ weight_name='forward-19-259-519-779.safetensors')
37
+ pipe_forward.fuse_lora()
38
+ pipe_forward.to(device)
39
+ # -----------------------------
40
 
41
  MAX_SEED = np.iinfo(np.int32).max
42
  MAX_IMAGE_SIZE = 1024
43
 
44
+ @spaces.GPU(duration=30)
45
+ def infer(image_path, input_prompt, edited_prompt, guidance, tau,
46
+ crs, srs, amplify_factor, amplify_word,
47
+ blend_orig, blend_edited, is_replacement):
48
 
49
+ tokenizer = pipe_forward.tokenizer
50
+ noise_scheduler = DDPMScheduler.from_pretrained(
51
+ "runwayml/stable-diffusion-v1-5", subfolder="scheduler", )
52
+
53
+ NUM_REVERSE_CONS_STEPS = 4
54
+ REVERSE_TIMESTEPS = [259, 519, 779, 999]
55
+ NUM_FORWARD_CONS_STEPS = 4
56
+ FORWARD_TIMESTEPS = [19, 259, 519, 779]
57
+ NUM_DDIM_STEPS = 50
58
+
59
+ solver = generation.Generator(
60
+ model=pipe_forward,
61
+ noise_scheduler=noise_scheduler,
62
+ n_steps=NUM_DDIM_STEPS,
63
+ forward_cons_model=pipe_forward,
64
+ forward_timesteps=FORWARD_TIMESTEPS,
65
+ reverse_cons_model=pipe_reverse,
66
+ reverse_timesteps=REVERSE_TIMESTEPS,
67
+ num_endpoints=NUM_REVERSE_CONS_STEPS,
68
+ num_forward_endpoints=NUM_FORWARD_CONS_STEPS,
69
+ max_forward_timestep_index=49,
70
+ start_timestep=19)
71
+
72
+ p2p.NUM_DDIM_STEPS = NUM_DDIM_STEPS
73
+ p2p.tokenizer = tokenizer
74
+ p2p.device = 'cuda'
75
+
76
+ prompt = [input_prompt]
77
+
78
+ (image_gt, image_rec), ddim_latent, uncond_embeddings = inversion.invert(
79
+ # Playing params
80
+ image_path=image_path,
81
+ prompt=prompt,
82
+
83
+ # Fixed params
84
+ is_cons_inversion=True,
85
+ w_embed_dim=512,
86
+ inv_guidance_scale=0.0,
87
+ stop_step=50,
88
+ solver=solver,
89
+ seed=10500)
90
+
91
+ p2p.NUM_DDIM_STEPS = 4
92
+ p2p.tokenizer = tokenizer
93
+ p2p.device = 'cuda'
94
+
95
+ prompts = [input_prompt,
96
+ edited_prompt
97
+ ]
98
+
99
+ # Playing params
100
+ cross_replace_steps = {'default_': crs, }
101
+ self_replace_steps = srs
102
+ blend_word = (((blend_orig,), (blend_edited,)))
103
+ eq_params = {"words": (amplify_word,), "values": (amplify_factor,)}
104
+
105
+ controller = p2p.make_controller(prompts,
106
+ is_replacement, # (is_replacement) True if only one word is changed
107
+ cross_replace_steps,
108
+ self_replace_steps,
109
+ blend_word,
110
+ eq_params)
111
+
112
+ tau = tau
113
+ image, _ = generation.runner(
114
+ # Playing params
115
+ guidance_scale=guidance-1,
116
+ tau1=tau, # Dynamic guidance if tau < 1.0
117
+ tau2=tau,
118
 
119
+ # Fixed params
120
+ model=pipe_reverse,
121
+ is_cons_forward=True,
122
+ w_embed_dim=512,
123
+ solver=solver,
124
+ prompt=prompts,
125
+ controller=controller,
126
+ num_inference_steps=50,
127
+ generator=None,
128
+ latent=ddim_latent,
129
+ uncond_embeddings=uncond_embeddings,
130
+ return_type='image')
131
+
132
+ image = generation.to_pil_images(image[1, :, :, :])
133
+ return image
134
 
135
  css="""
136
  #col-container {
137
  margin: 0 auto;
138
+ max-width: 1024px;
139
  }
140
  """
141
 
 
147
  with gr.Blocks(css=css) as demo:
148
 
149
  with gr.Column(elem_id="col-container"):
150
+ gr.Markdown(
151
+ f"""
152
+ # Invertible Consistency Distillation ⚡
153
+ # ⚡ Text-guided image editing with 8-step iCD-SD1.5 ⚡
154
+ This is a demo for [Invertible Consistency Distillation](https://yandex-research.github.io/invertible-cd/),
155
+ a diffusion distillation method proposed in [Invertible Consistency Distillation for Text-Guided Image Editing in Around 7 Steps](https://arxiv.org/abs/2406.14539)
156
+ by [Yandex Research](https://github.com/yandex-research).
157
+ Currently running on {power_device}
158
+ """
159
+ )
160
+ gr.Markdown(
161
+ "**Please** check the examples to catch the intuition behind the hyperparameters, which are quite important for successful editing. A short description: <br />1. *Dynamic guidance tau*. Controls the interval where guidance is applied: if t < tau, then guidance is turned on for t < tau."
162
+ " Lower tau values provide better reference preservation. We commonly use tau=0.6 and tau=0.8. <br />"
163
+ "2. *Cross replace steps (crs)* and *self replace steps (srs)*. Controls the time step interval "
164
+ "where the cross- and self-attention maps are replaced. Higher values lead to better preservation of the reference image. "
165
+ "The optimal values depend on the particular image. "
166
+ "Mostly, we use crs and srs from 0.2 to 0.6. <br />"
167
+ "3. *Amplify word* and *Amplify factor*. Define the word that needs to be enhanced in the edited image. <br />"
168
+ "4. *Blended word*. Specifies the object used for making local edits. That is, edit only selected objects. <br />"
169
+ "5. *Is replacement*. You can set True, if you replace only one word in the original prompt. But False also works in these cases."
170
+ )
171
+ gr.Markdown(
172
+ "Feel free to check out our [image generation demo](https://huggingface.co/spaces/dbaranchuk/demo-app) as well."
173
+ )
174
+ gr.Markdown(
175
+ "If you enjoy the space, feel free to give a ⭐ to the <a href='https://github.com/yandex-research/invertible-cd' target='_blank'>Github Repo</a>. [![GitHub Stars](https://img.shields.io/github/stars/yandex-research/invertible-cd?style=social)](https://github.com/yandex-research/invertible-cd)"
176
+ )
177
  with gr.Row():
178
 
179
+ input_prompt = gr.Text(
+ label="Original prompt",
  max_lines=1,
  placeholder="Enter your prompt",
  )

+ prompt = gr.Text(
+ label="Edited prompt",
  max_lines=1,
+ placeholder="Enter your prompt",
  )

+
+ with gr.Row():

+ with gr.Column():
195
+ input_image = gr.Image(label="Input image", height=512, width=512, show_label=False)
196
+ with gr.Column():
197
+ result = gr.Image(label="Result", height=512, width=512, show_label=False)
198
+
199
+ with gr.Accordion("Advanced Settings", open=True):
200
 
201
  with gr.Row():
202
 
203
+ guidance_scale = gr.Slider(
204
+ label="Guidance scale",
205
+ minimum=1.0,
206
+ maximum=20.0,
207
+ step=1.0,
208
+ value=20.0,
209
  )
210
+
211
+ tau = gr.Slider(
212
+ label="Dynamic guidance tau",
213
+ minimum=0.0,
214
+ maximum=1.0,
215
+ step=0.2,
216
+ value=0.8,
217
  )
218
+
219
  with gr.Row():
220
 
221
+ crs = gr.Slider(
222
+ label="Cross replace steps",
223
  minimum=0.0,
224
+ maximum=1.0,
225
  step=0.1,
226
+ value=0.4
227
  )
228
+
229
+ srs = gr.Slider(
230
+ label="Self replace steps",
231
+ minimum=0.0,
232
+ maximum=1.0,
233
+ step=0.1,
234
+ value=0.4,
235
  )
236
+
237
+ with gr.Row():
238
+ amplify_word = gr.Text(
239
+ label="Amplify word",
240
+ max_lines=1,
241
+ placeholder="Enter your word",
242
+ )
243
+
244
+ amplify_factor = gr.Slider(
245
+ label="Amplify factor",
246
+ minimum=0.0,
247
+ maximum=30,
248
+ step=1.0,
249
+ value=1,
250
+ )
251
+ with gr.Row():
252
+
253
+ blend_orig = gr.Text(
254
+ label="Blended word 1",
255
+ max_lines=1,
256
+ placeholder="Enter your word",)
257
+
258
+ blend_edited = gr.Text(
259
+ label="Blended word 2",
260
+ max_lines=1,
261
+ placeholder="Enter your word",)
262
+
263
+ with gr.Row():
264
+
265
+ is_replacement = gr.Checkbox(label="Is replacement?", value=False)
266
+
267
+ with gr.Row():
268
+ run_button = gr.Button("Edit", scale=0)
269
+
270
+ with gr.Row():
271
+ examples = [
272
+ [
273
+ "examples/orig_3.jpg", #input_image
274
+ "a photo of a basket of apples", #src_prompt
275
+ "a photo of a basket of oranges", #tgt_prompt
276
+ 20, #guidance_scale
277
+ 0.6, #tau
278
+ 0.4, #crs
279
+ 0.6, #srs
280
+ 1, #amplify factor
281
+ 'oranges', # amplify word
282
+ '', #orig blend
283
+ 'oranges', #edited blend
284
+ False #replacement
285
+ ],
286
+ [
287
+ "examples/orig_3.jpg", #input_image
288
+ "a photo of a basket of apples", #src_prompt
289
+ "a photo of a basket of puppies", #tgt_prompt
290
+ 20, #guidance_scale
291
+ 0.6, #tau
292
+ 0.4, #crs
293
+ 0.1, #srs
294
+ 2, #amplify factor
295
+ 'puppies', # amplify word
296
+ '', #orig blend
297
+ 'puppies', #edited blend
298
+ True #replacement
299
+ ],
300
+ [
301
+ "examples/orig_3.jpg", #input_image
302
+ "a photo of a basket of apples", #src_prompt
303
+ "a photo of a basket of apples under snowfall", #tgt_prompt
304
+ 20, #guidance_scale
305
+ 0.6, #tau
306
+ 0.4, #crs
307
+ 0.4, #srs
308
+ 30, #amplify factor
309
+ 'snowfall', # amplify word
310
+ '', #orig blend
311
+ 'snowfall', #edited blend
312
+ False #replacement
313
+ ],
314
+ [
315
+ "examples/orig_1.jpg", #input_image
316
+ "a photo of an owl", #src_prompt
317
+ "a photo of an yellow owl", #tgt_prompt
318
+ 20, #guidance_scale
319
+ 0.6, #tau
320
+ 0.9, #crs
321
+ 0.9, #srs
322
+ 20, #amplify factor
323
+ 'yellow', # amplify word
324
+ 'owl', #orig blend
325
+ 'yellow', #edited blend
326
+ False #replacement
327
+ ],
328
+ [
329
+ "examples/orig_1.jpg", #input_image
330
+ "a photo of an owl", #src_prompt
331
+ "an anime-style painting of an owl", #tgt_prompt
332
+ 20, #guidance_scale
333
+ 0.8, #tau
334
+ 0.6, #crs
335
+ 0.3, #srs
336
+ 10, #amplify factor
337
+ 'anime-style', # amplify word
338
+ 'painting', #orig blend
339
+ 'anime-style', #edited blend
340
+ False #replacement
341
+ ],
342
+ [
343
+ "examples/orig_1.jpg", #input_image
344
+ "a photo of an owl", #src_prompt
345
+ "a photo of an owl underwater with many fishes nearby", #tgt_prompt
346
+ 20, #guidance_scale
347
+ 0.8, #tau
348
+ 0.4, #crs
349
+ 0.4, #srs
350
+ 18, #amplify factor
351
+ 'fishes', # amplify word
352
+ '', #orig blend
353
+ 'fishes', #edited blend
354
+ False #replacement
355
+ ],
356
+ [
357
+ "examples/orig_2.jpg", #input_image
358
+ "a photograph of a teddy bear sitting on a wall", #src_prompt
359
+ "a photograph of a teddy bear sitting on a wall surrounded by roses", #tgt_prompt
360
+ 20, #guidance_scale
361
+ 0.6, #tau
362
+ 0.4, #crs
363
+ 0.1, #srs
364
+ 25, #amplify factor
365
+ 'roses', # amplify word
366
+ '', #orig blend
367
+ 'roses', #edited blend
368
+ False #replacement
369
+ ],
370
+ [
371
+ "examples/orig_2.jpg", #input_image
372
+ "a photograph of a teddy bear sitting on a wall", #src_prompt
373
+ "a photograph of a wooden bear sitting on a wall", #tgt_prompt
374
+ 20, #guidance_scale
375
+ 0.8, #tau
376
+ 0.5, #crs
377
+ 0.5, #srs
378
+ 14, #amplify factor
379
+ 'wooden', # amplify word
380
+ '', #orig blend
381
+ 'wooden', #edited blend
382
+ True #replacement
383
+ ],
384
+ [
385
+ "examples/orig_2.jpg", #input_image
386
+ "a photograph of a teddy bear sitting on a wall", #src_prompt
387
+ "a photograph of a teddy rabbit sitting on a wall", #tgt_prompt
388
+ 20, #guidance_scale
389
+ 0.8, #tau
390
+ 0.4, #crs
391
+ 0.4, #srs
392
+ 3, #amplify factor
393
+ 'rabbit', # amplify word
394
+ '', #orig blend
395
+ 'rabbit', #edited blend
396
+ True #replacement
397
+ ],
398
+ ]
399
+
400
+ gr.Examples(
401
+ examples = examples,
402
+ inputs =[input_image, input_prompt, prompt,
403
+ guidance_scale, tau, crs, srs, amplify_factor, amplify_word,
404
+ blend_orig, blend_edited, is_replacement],
405
+ outputs=[
406
+ result
407
+ ],
408
+ fn=infer, cache_examples=True
409
+ )
410
 
411
  run_button.click(
412
  fn = infer,
413
+ inputs=[input_image, input_prompt, prompt,
414
+ guidance_scale, tau, crs, srs, amplify_factor, amplify_word,
415
+ blend_orig, blend_edited, is_replacement],
416
  outputs = [result]
417
  )
418
 
419
+ demo.queue().launch()
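
Note on the "Dynamic guidance tau" slider described in the Markdown above: app.py passes tau1 = tau2 = tau into generation.runner, so the linear ramp in generation.linear_schedule_old collapses into a hard switch. A minimal sketch of the resulting behavior (illustration only, not part of the commit; guidance_scale here is the value after the `guidance - 1` shift in infer):

    def effective_guidance(t, guidance_scale, tau):
        # linear_schedule_old with tau1 == tau2 == tau:
        # guidance is active only while t / 1000 <= tau
        return guidance_scale if t / 1000 <= tau else 0.0

    effective_guidance(999, 19, 0.8)  # -> 0.0  (no guidance at the noisiest steps)
    effective_guidance(259, 19, 0.8)  # -> 19.0 (guidance on once t drops below tau)
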
generation.py ADDED
@@ -0,0 +1,621 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image, ImageDraw, ImageFont
4
+ from tqdm import tqdm
5
+ from typing import Union
6
+ from IPython.display import display
7
+ import p2p
8
+
9
+
10
+ # Main function to run
11
+ # ----------------------------------------------------------------------
12
+ @torch.no_grad()
13
+ def runner(
14
+ model,
15
+ prompt,
16
+ controller,
17
+ solver,
18
+ is_cons_forward=False,
19
+ num_inference_steps=50,
20
+ guidance_scale=7.5,
21
+ generator=None,
22
+ latent=None,
23
+ uncond_embeddings=None,
24
+ start_time=50,
25
+ return_type='image',
26
+ dynamic_guidance=False,
27
+ tau1=0.4,
28
+ tau2=0.6,
29
+ w_embed_dim=0,
30
+ ):
31
+ p2p.register_attention_control(model, controller)
32
+ height = width = 512
33
+ solver.init_prompt(prompt, None)
34
+ latent, latents = init_latent(latent, model, 512, 512, generator, len(prompt))
35
+ model.scheduler.set_timesteps(num_inference_steps)
36
+ dynamic_guidance = True if tau1 < 1.0 or tau2 < 1.0 else False
37
+
38
+ if not is_cons_forward:
39
+ latents = solver.ddim_loop(latents,
40
+ num_inference_steps,
41
+ is_forward=False,
42
+ guidance_scale=guidance_scale,
43
+ dynamic_guidance=dynamic_guidance,
44
+ tau1=tau1,
45
+ tau2=tau2,
46
+ w_embed_dim=w_embed_dim,
47
+ uncond_embeddings=uncond_embeddings if uncond_embeddings is not None else None,
48
+ controller=controller)
49
+ latents = latents[-1]
50
+ else:
51
+ latents = solver.cons_generation(
52
+ latents,
53
+ guidance_scale=guidance_scale,
54
+ w_embed_dim=w_embed_dim,
55
+ dynamic_guidance=dynamic_guidance,
56
+ tau1=tau1,
57
+ tau2=tau2,
58
+ controller=controller)
59
+ latents = latents[-1]
60
+
61
+ if return_type == 'image':
62
+ image = latent2image(model.vae, latents.to(model.vae.dtype))
63
+ else:
64
+ image = latents
65
+
66
+ return image, latent
67
+
68
+
69
+ # ----------------------------------------------------------------------
70
+
71
+
72
+ # Utils
73
+ # ----------------------------------------------------------------------
74
+ def linear_schedule_old(t, guidance_scale, tau1, tau2):
75
+ t = t / 1000
76
+ if t <= tau1:
77
+ gamma = 1.0
78
+ elif t >= tau2:
79
+ gamma = 0.0
80
+ else:
81
+ gamma = (tau2 - t) / (tau2 - tau1)
82
+ return gamma * guidance_scale
83
+
84
+
85
+ def linear_schedule(t, guidance_scale, tau1=0.4, tau2=0.8):
86
+ t = t / 1000
87
+ if t <= tau1:
88
+ return guidance_scale
89
+ if t >= tau2:
90
+ return 1.0
91
+ gamma = (tau2 - t) / (tau2 - tau1) * (guidance_scale - 1.0) + 1.0
92
+
93
+ return gamma
94
+
95
+
96
+ def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
97
+ """
98
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
99
+
100
+ Args:
101
+ timesteps (`torch.Tensor`):
102
+ generate embedding vectors at these timesteps
103
+ embedding_dim (`int`, *optional*, defaults to 512):
104
+ dimension of the embeddings to generate
105
+ dtype:
106
+ data type of the generated embeddings
107
+
108
+ Returns:
109
+ `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
110
+ """
111
+ assert len(w.shape) == 1
112
+ w = w * 1000.0
113
+
114
+ half_dim = embedding_dim // 2
115
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
116
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
117
+ emb = w.to(dtype)[:, None] * emb[None, :]
118
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
119
+ if embedding_dim % 2 == 1: # zero pad
120
+ emb = torch.nn.functional.pad(emb, (0, 1))
121
+ assert emb.shape == (w.shape[0], embedding_dim)
122
+ return emb
123
+
124
+
125
+ # ----------------------------------------------------------------------
126
+
127
+
128
+ # Diffusion step with scheduler from diffusers and controller for editing
129
+ # ----------------------------------------------------------------------
130
+ def extract_into_tensor(a, t, x_shape):
131
+ b, *_ = t.shape
132
+ out = a.gather(-1, t)
133
+ return out.reshape(b, *((1,) * (len(x_shape) - 1)))
134
+
135
+
136
+ def predicted_origin(model_output, timesteps, boundary_timesteps, sample, prediction_type, alphas, sigmas):
137
+ sigmas_s = extract_into_tensor(sigmas, boundary_timesteps, sample.shape)
138
+ alphas_s = extract_into_tensor(alphas, boundary_timesteps, sample.shape)
139
+
140
+ sigmas = extract_into_tensor(sigmas, timesteps, sample.shape)
141
+ alphas = extract_into_tensor(alphas, timesteps, sample.shape)
142
+
143
+ # Set hard boundaries to ensure equivalence with forward (direct) CD
144
+ alphas_s[boundary_timesteps == 0] = 1.0
145
+ sigmas_s[boundary_timesteps == 0] = 0.0
146
+
147
+ if prediction_type == "epsilon":
148
+ pred_x_0 = (sample - sigmas * model_output) / alphas # x0 prediction
149
+ pred_x_0 = alphas_s * pred_x_0 + sigmas_s * model_output # Euler step to the boundary step
150
+ elif prediction_type == "v_prediction":
151
+ assert boundary_timesteps == 0, "v_prediction does not support multiple endpoints at the moment"
152
+ pred_x_0 = alphas * sample - sigmas * model_output
153
+ else:
154
+ raise ValueError(f"Prediction type {prediction_type} currently not supported.")
155
+ return pred_x_0
156
+
157
+
158
+ def guided_step(noise_prediction_text,
159
+ noise_pred_uncond,
160
+ t,
161
+ guidance_scale,
162
+ dynamic_guidance=False,
163
+ tau1=0.4,
164
+ tau2=0.6):
165
+ if dynamic_guidance:
166
+ if not isinstance(t, int):
167
+ t = t.item()
168
+ new_guidance_scale = linear_schedule(t, guidance_scale, tau1=tau1, tau2=tau2)
169
+ else:
170
+ new_guidance_scale = guidance_scale
171
+
172
+ noise_pred = noise_pred_uncond + new_guidance_scale * (noise_prediction_text - noise_pred_uncond)
173
+ return noise_pred
174
+
175
+
176
+ # ----------------------------------------------------------------------
177
+
178
+
179
+ # DDIM scheduler with inversion
180
+ # ----------------------------------------------------------------------
181
+ class Generator:
182
+
183
+ def prev_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
184
+ sample: Union[torch.FloatTensor, np.ndarray]):
185
+ prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
186
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
187
+ alpha_prod_t_prev = self.scheduler.alphas_cumprod[
188
+ prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
189
+ beta_prod_t = 1 - alpha_prod_t
190
+ pred_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
191
+ pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * model_output
192
+ prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
193
+ return prev_sample
194
+
195
+ def next_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int,
196
+ sample: Union[torch.FloatTensor, np.ndarray]):
197
+ timestep, next_timestep = min(
198
+ timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999), timestep
199
+ alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
200
+ alpha_prod_t_next = self.scheduler.alphas_cumprod[next_timestep]
201
+ beta_prod_t = 1 - alpha_prod_t
202
+ next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
203
+ next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
204
+ next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
205
+ return next_sample
206
+
207
+ def get_noise_pred_single(self, latents, t, context):
208
+ noise_pred = self.model.unet(latents, t, encoder_hidden_states=context)["sample"]
209
+ return noise_pred
210
+
211
+ def get_noise_pred(self,
212
+ model,
213
+ latent,
214
+ t,
215
+ guidance_scale=1,
216
+ context=None,
217
+ w_embed_dim=0,
218
+ dynamic_guidance=False,
219
+ tau1=0.4,
220
+ tau2=0.6):
221
+ latents_input = torch.cat([latent] * 2)
222
+ if context is None:
223
+ context = self.context
224
+
225
+ # w embed
226
+ # --------------------------------------
227
+ if w_embed_dim > 0:
228
+ if dynamic_guidance:
+ t_item = t.item() if not isinstance(t, int) else t  # avoid NameError when t is a plain int
+ guidance_scale = linear_schedule_old(t_item, guidance_scale, tau1=tau1, tau2=tau2)  # TODO UPDATE
232
+ if len(latents_input) == 4:
233
+ guidance_scale_tensor = torch.tensor([0.0, 0.0, 0.0, guidance_scale])
234
+ else:
235
+ guidance_scale_tensor = torch.tensor([guidance_scale] * len(latents_input))
236
+ w_embedding = guidance_scale_embedding(guidance_scale_tensor, embedding_dim=w_embed_dim)
237
+ w_embedding = w_embedding.to(device=latent.device, dtype=latent.dtype)
238
+ else:
239
+ w_embedding = None
240
+ # --------------------------------------
241
+ noise_pred = model.unet(latents_input.to(dtype=model.unet.dtype),
242
+ t,
243
+ timestep_cond=w_embedding.to(dtype=model.unet.dtype) if w_embed_dim > 0 else None,
244
+ encoder_hidden_states=context)["sample"]
245
+ noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
246
+
247
+ if guidance_scale > 1 and w_embedding is None:
248
+ noise_pred = guided_step(noise_prediction_text, noise_pred_uncond, t, guidance_scale, dynamic_guidance,
249
+ tau1, tau2)
250
+ else:
251
+ noise_pred = noise_prediction_text
252
+
253
+ return noise_pred
254
+
255
+ @torch.no_grad()
256
+ def latent2image(self, latents, return_type='np'):
257
+ latents = 1 / 0.18215 * latents.detach()
258
+ image = self.model.vae.decode(latents.to(dtype=self.model.dtype))['sample']
259
+ if return_type == 'np':
260
+ image = (image / 2 + 0.5).clamp(0, 1)
261
+ image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
262
+ image = (image * 255).astype(np.uint8)
263
+ return image
264
+
265
+ @torch.no_grad()
266
+ def image2latent(self, image):
267
+ with torch.no_grad():
268
+ if type(image) is Image:
269
+ image = np.array(image)
270
+ if type(image) is torch.Tensor and image.dim() == 4:
271
+ latents = image
272
+ elif type(image) is list:
273
+ image = [np.array(i).reshape(1, 512, 512, 3) for i in image]
274
+ image = np.concatenate(image)
275
+ image = torch.from_numpy(image).float() / 127.5 - 1
276
+ image = image.permute(0, 3, 1, 2).to(self.model.device, dtype=self.model.vae.dtype)
277
+ latents = self.model.vae.encode(image)['latent_dist'].mean
278
+ latents = latents * 0.18215
279
+ else:
280
+ image = torch.from_numpy(image).float() / 127.5 - 1
281
+ image = image.permute(2, 0, 1).unsqueeze(0).to(self.model.device, dtype=self.model.dtype)
282
+ latents = self.model.vae.encode(image)['latent_dist'].mean
283
+ latents = latents * 0.18215
284
+ return latents
285
+
286
+ @torch.no_grad()
287
+ def init_prompt(self, prompt, uncond_embeddings=None):
288
+ if uncond_embeddings is None:
289
+ uncond_input = self.model.tokenizer(
290
+ [""], padding="max_length", max_length=self.model.tokenizer.model_max_length,
291
+ return_tensors="pt"
292
+ )
293
+ uncond_embeddings = self.model.text_encoder(uncond_input.input_ids.to(self.model.device))[0]
294
+ text_input = self.model.tokenizer(
295
+ prompt,
296
+ padding="max_length",
297
+ max_length=self.model.tokenizer.model_max_length,
298
+ truncation=True,
299
+ return_tensors="pt",
300
+ )
301
+ text_embeddings = self.model.text_encoder(text_input.input_ids.to(self.model.device))[0]
302
+ self.context = torch.cat([uncond_embeddings.expand(*text_embeddings.shape), text_embeddings])
303
+ self.prompt = prompt
304
+
305
+ @torch.no_grad()
306
+ def ddim_loop(self,
307
+ latent,
308
+ n_steps,
309
+ is_forward=True,
310
+ guidance_scale=1,
311
+ dynamic_guidance=False,
312
+ tau1=0.4,
313
+ tau2=0.6,
314
+ w_embed_dim=0,
315
+ uncond_embeddings=None,
316
+ controller=None):
317
+ all_latent = [latent]
318
+ latent = latent.clone().detach()
319
+ for i in tqdm(range(n_steps)):
320
+ if uncond_embeddings is not None:
321
+ self.init_prompt(self.prompt, uncond_embeddings[i])
322
+ if is_forward:
323
+ t = self.model.scheduler.timesteps[len(self.model.scheduler.timesteps) - i - 1]
324
+ else:
325
+ t = self.model.scheduler.timesteps[i]
326
+ noise_pred = self.get_noise_pred(
327
+ model=self.model,
328
+ latent=latent,
329
+ t=t,
330
+ context=None,
331
+ guidance_scale=guidance_scale,
332
+ dynamic_guidance=dynamic_guidance,
333
+ w_embed_dim=w_embed_dim,
334
+ tau1=tau1,
335
+ tau2=tau2)
336
+ if is_forward:
337
+ latent = self.next_step(noise_pred, t, latent)
338
+ else:
339
+ latent = self.prev_step(noise_pred, t, latent)
340
+ if controller is not None:
341
+ latent = controller.step_callback(latent)
342
+ all_latent.append(latent)
343
+ return all_latent
344
+
345
+ @property
346
+ def scheduler(self):
347
+ return self.model.scheduler
348
+
349
+ @torch.no_grad()
350
+ def ddim_inversion(self,
351
+ image,
352
+ n_steps=None,
353
+ guidance_scale=1,
354
+ dynamic_guidance=False,
355
+ tau1=0.4,
356
+ tau2=0.6,
357
+ w_embed_dim=0):
358
+
359
+ if n_steps is None:
360
+ n_steps = self.n_steps
361
+ latent = self.image2latent(image)
362
+ image_rec = self.latent2image(latent)
363
+ ddim_latents = self.ddim_loop(latent,
364
+ is_forward=True,
365
+ guidance_scale=guidance_scale,
366
+ n_steps=n_steps,
367
+ dynamic_guidance=dynamic_guidance,
368
+ tau1=tau1,
369
+ tau2=tau2,
370
+ w_embed_dim=w_embed_dim)
371
+ return image_rec, ddim_latents
372
+
373
+ @torch.no_grad()
374
+ def cons_generation(self,
375
+ latent,
376
+ guidance_scale=1,
377
+ dynamic_guidance=False,
378
+ tau1=0.4,
379
+ tau2=0.6,
380
+ w_embed_dim=0,
381
+ controller=None, ):
382
+
383
+ all_latent = [latent]
384
+ latent = latent.clone().detach()
385
+ alpha_schedule = torch.sqrt(self.model.scheduler.alphas_cumprod).to(self.model.device)
386
+ sigma_schedule = torch.sqrt(1 - self.model.scheduler.alphas_cumprod).to(self.model.device)
387
+
388
+ for i, (t, s) in enumerate(tqdm(zip(self.reverse_timesteps, self.reverse_boundary_timesteps))):
389
+ noise_pred = self.get_noise_pred(
390
+ model=self.reverse_cons_model,
391
+ latent=latent,
392
+ t=t.to(self.model.device),
393
+ context=None,
394
+ tau1=tau1, tau2=tau2,
395
+ w_embed_dim=w_embed_dim,
396
+ guidance_scale=guidance_scale,
397
+ dynamic_guidance=dynamic_guidance)
398
+
399
+ latent = predicted_origin(
400
+ noise_pred,
401
+ torch.tensor([t] * len(latent), device=self.model.device),
402
+ torch.tensor([s] * len(latent), device=self.model.device),
403
+ latent,
404
+ self.model.scheduler.config.prediction_type,
405
+ alpha_schedule,
406
+ sigma_schedule,
407
+ )
408
+ if controller is not None:
409
+ latent = controller.step_callback(latent)
410
+ all_latent.append(latent)
411
+
412
+ return all_latent
413
+
414
+ @torch.no_grad()
415
+ def cons_inversion(self,
416
+ image,
417
+ guidance_scale=0.0,
418
+ w_embed_dim=0,
419
+ seed=0):
420
+ alpha_schedule = torch.sqrt(self.model.scheduler.alphas_cumprod).to(self.model.device)
421
+ sigma_schedule = torch.sqrt(1 - self.model.scheduler.alphas_cumprod).to(self.model.device)
422
+
423
+ # 5. Prepare latent variables
424
+ latent = self.image2latent(image)
425
+ generator = torch.Generator().manual_seed(seed)
426
+ noise = torch.randn(latent.shape, generator=generator).to(latent.device)
427
+ latent = self.noise_scheduler.add_noise(latent, noise, torch.tensor([self.start_timestep]))
428
+ image_rec = self.latent2image(latent)
429
+
430
+ for i, (t, s) in enumerate(tqdm(zip(self.forward_timesteps, self.forward_boundary_timesteps))):
431
+ # predict the noise residual
432
+ noise_pred = self.get_noise_pred(
433
+ model=self.forward_cons_model,
434
+ latent=latent,
435
+ t=t.to(self.model.device),
436
+ context=None,
437
+ guidance_scale=guidance_scale,
438
+ w_embed_dim=w_embed_dim,
439
+ dynamic_guidance=False)
440
+
441
+ latent = predicted_origin(
442
+ noise_pred,
443
+ torch.tensor([t] * len(latent), device=self.model.device),
444
+ torch.tensor([s] * len(latent), device=self.model.device),
445
+ latent,
446
+ self.model.scheduler.config.prediction_type,
447
+ alpha_schedule,
448
+ sigma_schedule,
449
+ )
450
+
451
+ return image_rec, [latent]
452
+
453
+ def _create_forward_inverse_timesteps(self,
454
+ num_endpoints,
455
+ n_steps,
456
+ max_inverse_timestep_index):
457
+ timestep_interval = n_steps // num_endpoints + int(n_steps % num_endpoints > 0)
458
+ endpoint_idxs = torch.arange(timestep_interval, n_steps, timestep_interval) - 1
459
+ inverse_endpoint_idxs = torch.arange(timestep_interval, n_steps, timestep_interval) - 1
460
+ inverse_endpoint_idxs = torch.tensor(inverse_endpoint_idxs.tolist() + [max_inverse_timestep_index])
461
+
462
+ endpoints = torch.tensor([0] + self.ddim_timesteps[endpoint_idxs].tolist())
463
+ inverse_endpoints = self.ddim_timesteps[inverse_endpoint_idxs]
464
+
465
+ return endpoints, inverse_endpoints
466
+
467
+ def __init__(self,
468
+ model,
469
+ n_steps,
470
+ noise_scheduler,
471
+ forward_cons_model=None,
472
+ reverse_cons_model=None,
473
+ num_endpoints=1,
474
+ num_forward_endpoints=1,
475
+ reverse_timesteps=None,
476
+ forward_timesteps=None,
477
+ max_forward_timestep_index=49,
478
+ start_timestep=19):
479
+
480
+ self.model = model
481
+ self.forward_cons_model = forward_cons_model
482
+ self.reverse_cons_model = reverse_cons_model
483
+ self.noise_scheduler = noise_scheduler
484
+
485
+ self.n_steps = n_steps
486
+ self.tokenizer = self.model.tokenizer
487
+ self.model.scheduler.set_timesteps(n_steps)
488
+ self.prompt = None
489
+ self.context = None
490
+ step_ratio = 1000 // n_steps
491
+ self.ddim_timesteps = (np.arange(1, n_steps + 1) * step_ratio).round().astype(np.int64) - 1
492
+ self.ddim_timesteps = torch.from_numpy(self.ddim_timesteps).long()
493
+ self.start_timestep = start_timestep
494
+
495
+ # Set endpoints for direct CTM
496
+ if reverse_timesteps is None or forward_timesteps is None:
497
+ endpoints, inverse_endpoints = self._create_forward_inverse_timesteps(num_endpoints, n_steps,
498
+ max_forward_timestep_index)
499
+ self.reverse_timesteps, self.reverse_boundary_timesteps = inverse_endpoints.flip(0), endpoints.flip(0)
500
+
501
+ # Set endpoints for forward CTM
502
+ endpoints, inverse_endpoints = self._create_forward_inverse_timesteps(num_forward_endpoints, n_steps,
503
+ max_forward_timestep_index)
504
+ self.forward_timesteps, self.forward_boundary_timesteps = endpoints, inverse_endpoints
505
+ self.forward_timesteps[0] = self.start_timestep
506
+ else:
507
+ self.reverse_timesteps, self.reverse_boundary_timesteps = reverse_timesteps, reverse_timesteps
508
+ self.reverse_timesteps.reverse()
509
+ self.reverse_boundary_timesteps = self.reverse_boundary_timesteps[1:] + [self.reverse_boundary_timesteps[0]]
510
+ self.reverse_boundary_timesteps[-1] = 0
511
+ self.reverse_timesteps, self.reverse_boundary_timesteps = torch.tensor(reverse_timesteps), torch.tensor(
512
+ self.reverse_boundary_timesteps)
513
+
514
+ self.forward_timesteps, self.forward_boundary_timesteps = forward_timesteps, forward_timesteps
515
+ self.forward_boundary_timesteps = self.forward_boundary_timesteps[1:] + [self.forward_boundary_timesteps[0]]
516
+ self.forward_boundary_timesteps[-1] = 999
517
+ self.forward_timesteps, self.forward_boundary_timesteps = torch.tensor(
518
+ self.forward_timesteps), torch.tensor(self.forward_boundary_timesteps)
519
+
520
+ print(f"Endpoints reverse CTM: {self.reverse_timesteps}, {self.reverse_boundary_timesteps}")
521
+ print(f"Endpoints forward CTM: {self.forward_timesteps}, {self.forward_boundary_timesteps}")
522
+
523
+ # ----------------------------------------------------------------------
524
+
525
+ # 3rd party utils
526
+ # ----------------------------------------------------------------------
527
+ def latent2image(vae, latents):
528
+ latents = 1 / 0.18215 * latents
529
+ image = vae.decode(latents)['sample']
530
+ image = (image / 2 + 0.5).clamp(0, 1)
531
+ image = image.cpu().permute(0, 2, 3, 1).numpy()
532
+ image = (image * 255).astype(np.uint8)
533
+ return image
534
+
535
+
536
+ def init_latent(latent, model, height, width, generator, batch_size):
537
+ if latent is None:
538
+ latent = torch.randn(
539
+ (1, model.unet.in_channels, height // 8, width // 8),
540
+ generator=generator,
541
+ )
542
+ latents = latent.expand(batch_size, model.unet.in_channels, height // 8, width // 8).to(model.device)
543
+ return latent, latents
544
+
545
+
546
+ def load_512(image_path, left=0, right=0, top=0, bottom=0):
547
+ # if type(image_path) is str:
548
+ # image = np.array(Image.open(image_path))[:, :, :3]
549
+ # else:
550
+ # image = image_path
551
+ # h, w, c = image.shape
552
+ # left = min(left, w - 1)
553
+ # right = min(right, w - left - 1)
554
+ # top = min(top, h - left - 1)
555
+ # bottom = min(bottom, h - top - 1)
556
+ # image = image[top:h - bottom, left:w - right]
557
+ # h, w, c = image.shape
558
+ # if h < w:
559
+ # offset = (w - h) // 2
560
+ # image = image[:, offset:offset + h]
561
+ # elif w < h:
562
+ # offset = (h - w) // 2
563
+ # image = image[offset:offset + w]
564
+ image = np.array(Image.open(image_path).convert('RGB'))[:, :, :3]
565
+ image = np.array(Image.fromarray(image).resize((512, 512)))
566
+ return image
567
+
568
+
569
+ def to_pil_images(images, num_rows=1, offset_ratio=0.02):
570
+ if type(images) is list:
571
+ num_empty = len(images) % num_rows
572
+ elif images.ndim == 4:
573
+ num_empty = images.shape[0] % num_rows
574
+ else:
575
+ images = [images]
576
+ num_empty = 0
577
+
578
+ empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
579
+ images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
580
+ num_items = len(images)
581
+
582
+ h, w, c = images[0].shape
583
+ offset = int(h * offset_ratio)
584
+ num_cols = num_items // num_rows
585
+ image_ = np.ones((h * num_rows + offset * (num_rows - 1),
586
+ w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
587
+ for i in range(num_rows):
588
+ for j in range(num_cols):
589
+ image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
590
+ i * num_cols + j]
591
+
592
+ pil_img = Image.fromarray(image_)
593
+ return pil_img
594
+
595
+
596
+ def view_images(images, num_rows=1, offset_ratio=0.02):
597
+ if type(images) is list:
598
+ num_empty = len(images) % num_rows
599
+ elif images.ndim == 4:
600
+ num_empty = images.shape[0] % num_rows
601
+ else:
602
+ images = [images]
603
+ num_empty = 0
604
+
605
+ empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
606
+ images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
607
+ num_items = len(images)
608
+
609
+ h, w, c = images[0].shape
610
+ offset = int(h * offset_ratio)
611
+ num_cols = num_items // num_rows
612
+ image_ = np.ones((h * num_rows + offset * (num_rows - 1),
613
+ w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
614
+ for i in range(num_rows):
615
+ for j in range(num_cols):
616
+ image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
617
+ i * num_cols + j]
618
+
619
+ pil_img = Image.fromarray(image_)
620
+ display(pil_img)
621
+ # ----------------------------------------------------------------------
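
Note: with the explicit timestep lists passed from app.py, Generator.__init__ pairs every consistency step with a boundary timestep, and cons_generation / cons_inversion jump from t to its boundary s through predicted_origin. A small sketch of the resulting schedules (illustration only, derived from the constructor above):

    # reverse (generation) path, noisy -> clean
    reverse_timesteps          = [999, 779, 519, 259]
    reverse_boundary_timesteps = [779, 519, 259, 0]
    # forward (inversion) path, clean -> noisy, starting from start_timestep = 19
    forward_timesteps          = [19, 259, 519, 779]
    forward_boundary_timesteps = [259, 519, 779, 999]

    for t, s in zip(reverse_timesteps, reverse_boundary_timesteps):
        print(f"reverse consistency step: x_{t} -> x_{s}")
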
inversion.py ADDED
@@ -0,0 +1,104 @@
1
+ import torch.nn.functional as nnf
2
+ import torch
3
+ import numpy as np
4
+
5
+ from tqdm import tqdm
6
+ from torch.optim.adam import Adam
7
+ from PIL import Image
8
+
9
+ from generation import load_512
10
+ from p2p import register_attention_control
11
+
12
+
13
+ def null_optimization(solver,
14
+ latents,
15
+ guidance_scale,
16
+ num_inner_steps,
17
+ epsilon):
18
+ uncond_embeddings, cond_embeddings = solver.context.chunk(2)
19
+ uncond_embeddings_list = []
20
+ latent_cur = latents[-1]
21
+ bar = tqdm(total=num_inner_steps * solver.n_steps)
22
+ for i in range(solver.n_steps):
23
+ uncond_embeddings = uncond_embeddings.clone().detach()
24
+ uncond_embeddings.requires_grad = True
25
+ optimizer = Adam([uncond_embeddings], lr=1e-2 * (1. - i / 100.))
26
+ latent_prev = latents[len(latents) - i - 2]
27
+ t = solver.model.scheduler.timesteps[i]
28
+ with torch.no_grad():
29
+ noise_pred_cond = solver.get_noise_pred_single(latent_cur, t, cond_embeddings)
30
+ for j in range(num_inner_steps):
31
+ noise_pred_uncond = solver.get_noise_pred_single(latent_cur, t, uncond_embeddings)
32
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_cond - noise_pred_uncond)
33
+ latents_prev_rec = solver.prev_step(noise_pred, t, latent_cur)
34
+ loss = nnf.mse_loss(latents_prev_rec, latent_prev)
35
+ optimizer.zero_grad()
36
+ loss.backward()
37
+ optimizer.step()
38
+ loss_item = loss.item()
39
+ bar.update()
40
+ if loss_item < epsilon + i * 2e-5:
41
+ break
42
+ for j in range(j + 1, num_inner_steps):
43
+ bar.update()
44
+ uncond_embeddings_list.append(uncond_embeddings[:1].detach())
45
+ with torch.no_grad():
46
+ context = torch.cat([uncond_embeddings, cond_embeddings])
47
+ noise_pred = solver.get_noise_pred(solver.model, latent_cur, t, guidance_scale, context)
48
+ latent_cur = solver.prev_step(noise_pred, t, latent_cur)
49
+ bar.close()
50
+ return uncond_embeddings_list
51
+
52
+
53
+ def invert(solver,
54
+ stop_step,
55
+ is_cons_inversion=False,
56
+ inv_guidance_scale=1,
57
+ nti_guidance_scale=8,
58
+ dynamic_guidance=False,
59
+ tau1=0.4,
60
+ tau2=0.6,
61
+ w_embed_dim=0,
62
+ image_path=None,
63
+ prompt='',
64
+ offsets=(0, 0, 0, 0),
65
+ do_nti=False,
66
+ do_npi=False,
67
+ num_inner_steps=10,
68
+ early_stop_epsilon=1e-5,
69
+ seed=0,
70
+ ):
71
+ solver.init_prompt(prompt)
72
+ uncond_embeddings, cond_embeddings = solver.context.chunk(2)
73
+ register_attention_control(solver.model, None)
74
+ if isinstance(image_path, list):
75
+ image_gt = [load_512(path, *offsets) for path in image_path]
76
+ elif isinstance(image_path, str):
77
+ image_gt = load_512(image_path, *offsets)
78
+ else:
79
+ image_gt = np.array(Image.fromarray(image_path).resize((512, 512)))
80
+
81
+ if is_cons_inversion:
82
+ image_rec, ddim_latents = solver.cons_inversion(image_gt,
83
+ w_embed_dim=w_embed_dim,
84
+ guidance_scale=inv_guidance_scale,
85
+ seed=seed,)
86
+ else:
87
+ image_rec, ddim_latents = solver.ddim_inversion(image_gt,
88
+ n_steps=stop_step,
89
+ guidance_scale=inv_guidance_scale,
90
+ dynamic_guidance=dynamic_guidance,
91
+ tau1=tau1, tau2=tau2,
92
+ w_embed_dim=w_embed_dim)
93
+ if do_nti:
94
+ print("Null-text optimization...")
95
+ uncond_embeddings = null_optimization(solver,
96
+ ddim_latents,
97
+ nti_guidance_scale,
98
+ num_inner_steps,
99
+ early_stop_epsilon)
100
+ elif do_npi:
101
+ uncond_embeddings = [cond_embeddings] * solver.n_steps
102
+ else:
103
+ uncond_embeddings = None
104
+ return (image_gt, image_rec), ddim_latents[-1], uncond_embeddings
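
Usage sketch for invert() (illustration only; assumes a generation.Generator instance named `solver` built as in app.py and a local image path):

    (image_gt, image_rec), latent, uncond_embeddings = invert(
        solver=solver,
        stop_step=50,
        is_cons_inversion=True,   # consistency inversion, as used by the demo
        w_embed_dim=512,
        inv_guidance_scale=0.0,
        image_path="examples/orig_1.jpg",
        prompt=["a photo of an owl"],
        seed=10500,
    )
    # Setting is_cons_inversion=False with do_nti=True falls back to DDIM inversion
    # refined by null-text optimization (null_optimization above).
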
p2p.py ADDED
@@ -0,0 +1,454 @@
1
+ import torch.nn.functional as nnf
2
+ import torch
3
+ import abc
4
+ import numpy as np
5
+ import seq_aligner
6
+
7
+ from typing import Optional, Union, Tuple, List, Callable, Dict
8
+
9
+ MAX_NUM_WORDS = 77
10
+ LOW_RESOURCE = False
11
+ NUM_DDIM_STEPS = 50
12
+ device = 'cuda'
13
+ tokenizer = None
14
+
15
+
16
+ # Different attention controllers
17
+ # ----------------------------------------------------------------------
18
+ class LocalBlend:
19
+
20
+ def get_mask(self, maps, alpha, use_pool, x_t):
21
+ k = 1
22
+ maps = (maps * alpha).sum(-1).mean(1)
23
+ if use_pool:
24
+ maps = nnf.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
25
+ mask = nnf.interpolate(maps, size=(x_t.shape[2:]))
26
+ mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
27
+ mask = mask.gt(self.th[1 - int(use_pool)])
28
+ mask = mask[:1] + mask
29
+ return mask
30
+
31
+ def __call__(self, x_t, attention_store):
32
+ self.counter += 1
33
+ if self.counter > self.start_blend:
34
+
35
+ maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
36
+ maps = [item.reshape(self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS) for item in maps]
37
+ maps = torch.cat(maps, dim=1)
38
+ mask = self.get_mask(maps, self.alpha_layers, True, x_t)
39
+ if self.substruct_layers is not None:
40
+ maps_sub = ~self.get_mask(maps, self.substruct_layers, False, x_t)
41
+ mask = mask * maps_sub
42
+ mask = mask.float()
43
+ x_t = x_t[:1] + mask * (x_t - x_t[:1])
44
+ return x_t
45
+
46
+ def __init__(self, prompts: List[str], words: [List[List[str]]], substruct_words=None, start_blend=0.2,
47
+ th=(.3, .3)):
48
+ alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
49
+ for i, (prompt, words_) in enumerate(zip(prompts, words)):
50
+ if type(words_) is str:
51
+ words_ = [words_]
52
+ for word in words_:
53
+ ind = get_word_inds(prompt, word, tokenizer)
54
+ alpha_layers[i, :, :, :, :, ind] = 1
55
+
56
+ if substruct_words is not None:
57
+ substruct_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
58
+ for i, (prompt, words_) in enumerate(zip(prompts, substruct_words)):
59
+ if type(words_) is str:
60
+ words_ = [words_]
61
+ for word in words_:
62
+ ind = get_word_inds(prompt, word, tokenizer)
63
+ substruct_layers[i, :, :, :, :, ind] = 1
64
+ self.substruct_layers = substruct_layers.to(device)
65
+ else:
66
+ self.substruct_layers = None
67
+ self.alpha_layers = alpha_layers.to(device)
68
+ self.start_blend = int(start_blend * NUM_DDIM_STEPS)
69
+ self.counter = 0
70
+ self.th = th
71
+
72
+
73
+ class EmptyControl:
74
+
75
+ def step_callback(self, x_t):
76
+ return x_t
77
+
78
+ def between_steps(self):
79
+ return
80
+
81
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
82
+ return attn
83
+
84
+
85
+ class AttentionControl(abc.ABC):
86
+
87
+ def step_callback(self, x_t):
88
+ return x_t
89
+
90
+ def between_steps(self):
91
+ return
92
+
93
+ @property
94
+ def num_uncond_att_layers(self):
95
+ return self.num_att_layers if LOW_RESOURCE else 0
96
+
97
+ @abc.abstractmethod
98
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
99
+ raise NotImplementedError
100
+
101
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
102
+ if self.cur_att_layer >= self.num_uncond_att_layers:
103
+ if LOW_RESOURCE:
104
+ attn = self.forward(attn, is_cross, place_in_unet)
105
+ else:
106
+ h = attn.shape[0]
107
+ attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
108
+ self.cur_att_layer += 1
109
+ if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
110
+ self.cur_att_layer = 0
111
+ self.cur_step += 1
112
+ self.between_steps()
113
+ return attn
114
+
115
+ def reset(self):
116
+ self.cur_step = 0
117
+ self.cur_att_layer = 0
118
+
119
+ def __init__(self):
120
+ self.cur_step = 0
121
+ self.num_att_layers = -1
122
+ self.cur_att_layer = 0
123
+
124
+
125
+ class SpatialReplace(EmptyControl):
126
+
127
+ def step_callback(self, x_t):
128
+ if self.cur_step < self.stop_inject:
129
+ b = x_t.shape[0]
130
+ x_t = x_t[:1].expand(b, *x_t.shape[1:])
131
+ return x_t
132
+
133
+ def __init__(self, stop_inject: float):
134
+ super(SpatialReplace, self).__init__()
135
+ self.stop_inject = int((1 - stop_inject) * NUM_DDIM_STEPS)
136
+
137
+
138
+ class AttentionStore(AttentionControl):
139
+
140
+ @staticmethod
141
+ def get_empty_store():
142
+ return {"down_cross": [], "mid_cross": [], "up_cross": [],
143
+ "down_self": [], "mid_self": [], "up_self": []}
144
+
145
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
146
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
147
+ if attn.shape[1] <= 32 ** 2: # avoid memory overhead
148
+ self.step_store[key].append(attn)
149
+ return attn
150
+
151
+ def between_steps(self):
152
+ if len(self.attention_store) == 0:
153
+ self.attention_store = self.step_store
154
+ else:
155
+ for key in self.attention_store:
156
+ for i in range(len(self.attention_store[key])):
157
+ self.attention_store[key][i] += self.step_store[key][i]
158
+ self.step_store = self.get_empty_store()
159
+
160
+ def get_average_attention(self):
161
+ average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in
162
+ self.attention_store}
163
+ return average_attention
164
+
165
+ def reset(self):
166
+ super(AttentionStore, self).reset()
167
+ self.step_store = self.get_empty_store()
168
+ self.attention_store = {}
169
+
170
+ def __init__(self):
171
+ super(AttentionStore, self).__init__()
172
+ self.step_store = self.get_empty_store()
173
+ self.attention_store = {}
174
+
175
+
176
+ class AttentionControlEdit(AttentionStore, abc.ABC):
177
+
178
+ def step_callback(self, x_t):
179
+ if self.local_blend is not None:
180
+ x_t = self.local_blend(x_t, self.attention_store)
181
+ return x_t
182
+
183
+ def replace_self_attention(self, attn_base, att_replace, place_in_unet):
184
+ if att_replace.shape[2] <= 32 ** 2:
185
+ attn_base = attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
186
+ return attn_base
187
+ else:
188
+ return att_replace
189
+
190
+ @abc.abstractmethod
191
+ def replace_cross_attention(self, attn_base, att_replace):
192
+ raise NotImplementedError
193
+
194
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
195
+ super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
196
+ if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
197
+ h = attn.shape[0] // (self.batch_size)
198
+ attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
199
+ attn_base, attn_repalce = attn[0], attn[1:]
200
+ if is_cross:
201
+ alpha_words = self.cross_replace_alpha[self.cur_step]
202
+ attn_repalce_new = self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + (
203
+ 1 - alpha_words) * attn_repalce
204
+ attn[1:] = attn_repalce_new
205
+ else:
206
+ attn[1:] = self.replace_self_attention(attn_base, attn_repalce, place_in_unet)
207
+ attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
208
+ return attn
209
+
210
+ def __init__(self, prompts, num_steps: int,
211
+ cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
212
+ self_replace_steps: Union[float, Tuple[float, float]],
213
+ local_blend: Optional[LocalBlend]):
214
+ super(AttentionControlEdit, self).__init__()
215
+ self.batch_size = len(prompts)
216
+ self.cross_replace_alpha = get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps,
217
+ tokenizer).to(device)
218
+ if type(self_replace_steps) is float:
219
+ self_replace_steps = 0, self_replace_steps
220
+ self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
221
+ self.local_blend = local_blend
222
+
223
+
224
+ class AttentionReplace(AttentionControlEdit):
225
+
226
+ def replace_cross_attention(self, attn_base, att_replace):
227
+ return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper)
228
+
229
+ def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
230
+ local_blend: Optional[LocalBlend] = None):
231
+ super(AttentionReplace, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
232
+ self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).to(device)
233
+
234
+
235
+ class AttentionRefine(AttentionControlEdit):
236
+
237
+ def replace_cross_attention(self, attn_base, att_replace):
238
+ attn_base_replace = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
239
+ attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas)
240
+ # attn_replace = attn_replace / attn_replace.sum(-1, keepdims=True)
241
+ return attn_replace
242
+
243
+ def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
244
+ local_blend: Optional[LocalBlend] = None):
245
+ super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
246
+ self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer)
247
+ self.mapper, alphas = self.mapper.to(device), alphas.to(device)
248
+ self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
249
+
250
+
251
+ class AttentionReweight(AttentionControlEdit):
252
+
253
+ def replace_cross_attention(self, attn_base, att_replace):
254
+ if self.prev_controller is not None:
255
+ attn_base = self.prev_controller.replace_cross_attention(attn_base, att_replace)
256
+ attn_replace = attn_base[None, :, :, :] * self.equalizer[:, None, None, :]
257
+ # attn_replace = attn_replace / attn_replace.sum(-1, keepdims=True)
258
+ return attn_replace
259
+
260
+ def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float, equalizer,
261
+ local_blend: Optional[LocalBlend] = None, controller: Optional[AttentionControlEdit] = None):
262
+ super(AttentionReweight, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps,
263
+ local_blend)
264
+ self.equalizer = equalizer.to(device)
265
+ self.prev_controller = controller
266
+ self.attn = []
267
+ # ----------------------------------------------------------------------
268
+
269
+
270
+ # Attention controller during sampling
+ # ----------------------------------------------------------------------
+ def make_controller(prompts: List[str], is_replace_controller: bool, cross_replace_steps: Dict[str, float],
+                     self_replace_steps: float, blend_words=None, equilizer_params=None) -> AttentionControlEdit:
+     if blend_words is None:
+         lb = None
+     else:
+         lb = LocalBlend(prompts, blend_words, start_blend=0.0, th=(0.3, 0.3))
+     if is_replace_controller:
+         controller = AttentionReplace(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
+                                       self_replace_steps=self_replace_steps, local_blend=lb)
+     else:
+         controller = AttentionRefine(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
+                                      self_replace_steps=self_replace_steps, local_blend=lb)
+     if equilizer_params is not None:
+         eq = get_equalizer(prompts[1], equilizer_params["words"], equilizer_params["values"])
+         controller = AttentionReweight(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
+                                        self_replace_steps=self_replace_steps, equalizer=eq, local_blend=lb,
+                                        controller=controller)
+     return controller
+ 
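A minimal usage sketch (illustrative only) of the factory above for a one-word swap, assuming the module-level tokenizer, device and NUM_DDIM_STEPS referenced earlier in p2p.py are already set up:

    prompts = ["a cat sitting on a car", "a tiger sitting on a car"]   # equal word count -> replacement edit
    controller = make_controller(prompts,
                                 is_replace_controller=True,
                                 cross_replace_steps={"default_": 0.8},
                                 self_replace_steps=0.4,
                                 blend_words=None,
                                 equilizer_params=None)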
+ def register_attention_control(model, controller):
+     def ca_forward(self, place_in_unet):
+         to_out = self.to_out
+         if type(to_out) is torch.nn.modules.container.ModuleList:
+             to_out = self.to_out[0]
+         else:
+             to_out = self.to_out
+ 
+         def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None, ):
+             is_cross = encoder_hidden_states is not None
+ 
+             residual = hidden_states
+ 
+             if self.spatial_norm is not None:
+                 hidden_states = self.spatial_norm(hidden_states, temb)
+ 
+             input_ndim = hidden_states.ndim
+ 
+             if input_ndim == 4:
+                 batch_size, channel, height, width = hidden_states.shape
+                 hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+ 
+             batch_size, sequence_length, _ = (
+                 hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+             )
+             attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ 
+             if self.group_norm is not None:
+                 hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+ 
+             query = self.to_q(hidden_states)
+ 
+             if encoder_hidden_states is None:
+                 encoder_hidden_states = hidden_states
+             elif self.norm_cross:
+                 encoder_hidden_states = self.norm_encoder_hidden_states(encoder_hidden_states)
+ 
+             key = self.to_k(encoder_hidden_states)
+             value = self.to_v(encoder_hidden_states)
+ 
+             query = self.head_to_batch_dim(query)
+             key = self.head_to_batch_dim(key)
+             value = self.head_to_batch_dim(value)
+ 
+             attention_probs = self.get_attention_scores(query, key, attention_mask)
+             attention_probs = controller(attention_probs, is_cross, place_in_unet)
+ 
+             hidden_states = torch.bmm(attention_probs, value)
+             hidden_states = self.batch_to_head_dim(hidden_states)
+ 
+             # linear proj
+             hidden_states = to_out(hidden_states)
+ 
+             if input_ndim == 4:
+                 hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+ 
+             if self.residual_connection:
+                 hidden_states = hidden_states + residual
+ 
+             hidden_states = hidden_states / self.rescale_output_factor
+ 
+             return hidden_states
+ 
+         return forward
+ 
+     class DummyController:
+ 
+         def __call__(self, *args):
+             return args[0]
+ 
+         def __init__(self):
+             self.num_att_layers = 0
+ 
+     if controller is None:
+         controller = DummyController()
+ 
+     def register_recr(net_, count, place_in_unet):
+         if net_.__class__.__name__ == 'Attention':
+             net_.forward = ca_forward(net_, place_in_unet)
+             return count + 1
+         elif hasattr(net_, 'children'):
+             for net__ in net_.children():
+                 count = register_recr(net__, count, place_in_unet)
+         return count
+ 
+     cross_att_count = 0
+     sub_nets = model.unet.named_children()
+     for net in sub_nets:
+         if "down" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "down")
+         elif "up" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "up")
+         elif "mid" in net[0]:
+             cross_att_count += register_recr(net[1], 0, "mid")
+ 
+     controller.num_att_layers = cross_att_count
+ # ----------------------------------------------------------------------
+ 
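A usage sketch (illustrative only) of hooking a controller into a loaded model; `ldm_stable` is a placeholder name for whatever pipeline object with a `.unet` attribute the app constructs:

    controller = make_controller(prompts, is_replace_controller=True,
                                 cross_replace_steps={"default_": 0.8}, self_replace_steps=0.4)
    register_attention_control(ldm_stable, controller)   # patches every Attention module's forward
    # during sampling, the controller now observes (and can rewrite) each attention map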
+ # Other
+ # ----------------------------------------------------------------------
+ def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float],
+                   Tuple[float, ...]]):
+     if type(word_select) is int or type(word_select) is str:
+         word_select = (word_select,)
+     equalizer = torch.ones(1, 77)
+ 
+     for word, val in zip(word_select, values):
+         inds = get_word_inds(text, word, tokenizer)
+         equalizer[:, inds] = val
+     return equalizer
+ 
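A small sketch (illustrative only) of building an equalizer that amplifies one word of the edited prompt, assuming the module-level tokenizer:

    eq = get_equalizer("a tiger sitting on a car", word_select=("tiger",), values=(4.0,))
    print(eq.shape)   # torch.Size([1, 77]); all ones except 4.0 at the token positions of "tiger"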
+ def get_time_words_attention_alpha(prompts, num_steps,
+                                    cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
+                                    tokenizer, max_num_words=77):
+     if type(cross_replace_steps) is not dict:
+         cross_replace_steps = {"default_": cross_replace_steps}
+     if "default_" not in cross_replace_steps:
+         cross_replace_steps["default_"] = (0., 1.)
+     alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
+     for i in range(len(prompts) - 1):
+         alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"],
+                                                   i)
+     for key, item in cross_replace_steps.items():
+         if key != "default_":
+             inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
+             for i, ind in enumerate(inds):
+                 if len(ind) > 0:
+                     alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
+     alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)
+     return alpha_time_words
+ 
+ def get_word_inds(text: str, word_place: int, tokenizer):
+     split_text = text.split(" ")
+     if type(word_place) is str:
+         word_place = [i for i, word in enumerate(split_text) if word_place == word]
+     elif type(word_place) is int:
+         word_place = [word_place]
+     out = []
+     if len(word_place) > 0:
+         words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
+         cur_len, ptr = 0, 0
+ 
+         for i in range(len(words_encode)):
+             cur_len += len(words_encode[i])
+             if ptr in word_place:
+                 out.append(i + 1)
+             if cur_len >= len(split_text[ptr]):
+                 ptr += 1
+                 cur_len = 0
+     return np.array(out)
+ 
+ 
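A sketch (illustrative only) of what get_word_inds returns; the CLIPTokenizer checkpoint here is an assumption, not necessarily the one app.py loads:

    from transformers import CLIPTokenizer
    tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    print(get_word_inds("a tiger sitting on a car", "tiger", tok))
    # e.g. array([2]): token 0 is BOS, token 1 is "a", token 2 is "tiger"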
+ def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int,
+                            word_inds: Optional[torch.Tensor] = None):
+     if type(bounds) is float:
+         bounds = 0, bounds
+     start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
+     if word_inds is None:
+         word_inds = torch.arange(alpha.shape[2])
+     alpha[: start, prompt_ind, word_inds] = 0
+     alpha[start: end, prompt_ind, word_inds] = 1
+     alpha[end:, prompt_ind, word_inds] = 0
+     return alpha
+ # ----------------------------------------------------------------------
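A self-contained sketch (illustrative only) of the alpha schedule these two helpers produce: cross-attention injection active for the first 80% of a 10-step schedule, for a single edited prompt:

    alpha = torch.zeros(10 + 1, 1, 77)                 # (num_steps + 1, len(prompts) - 1, max_num_words)
    alpha = update_alpha_time_word(alpha, 0.8, prompt_ind=0)
    print(alpha[:, 0, 0])                              # tensor([1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.])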
requirements.txt CHANGED
@@ -2,5 +2,7 @@ accelerate
  diffusers
  invisible_watermark
  torch
+ peft
  transformers
- xformers
+ xformers
+ ipython
seq_aligner.py ADDED
@@ -0,0 +1,181 @@
+ import torch
+ import numpy as np
+ 
+ 
+ class ScoreParams:
+ 
+     def __init__(self, gap, match, mismatch):
+         self.gap = gap
+         self.match = match
+         self.mismatch = mismatch
+ 
+     def mis_match_char(self, x, y):
+         if x != y:
+             return self.mismatch
+         else:
+             return self.match
+ 
+ 
+ def get_matrix(size_x, size_y, gap):
+     matrix = []
+     for i in range(len(size_x) + 1):
+         sub_matrix = []
+         for j in range(len(size_y) + 1):
+             sub_matrix.append(0)
+         matrix.append(sub_matrix)
+     for j in range(1, len(size_y) + 1):
+         matrix[0][j] = j * gap
+     for i in range(1, len(size_x) + 1):
+         matrix[i][0] = i * gap
+     return matrix
+ 
+ 
+ def get_matrix(size_x, size_y, gap):
+     matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
+     matrix[0, 1:] = (np.arange(size_y) + 1) * gap
+     matrix[1:, 0] = (np.arange(size_x) + 1) * gap
+     return matrix
+ 
+ 
+ def get_traceback_matrix(size_x, size_y):
+     matrix = np.zeros((size_x + 1, size_y + 1), dtype=np.int32)
+     matrix[0, 1:] = 1
+     matrix[1:, 0] = 2
+     matrix[0, 0] = 4
+     return matrix
+ 
+ 
+ def global_align(x, y, score):
+     matrix = get_matrix(len(x), len(y), score.gap)
+     trace_back = get_traceback_matrix(len(x), len(y))
+     for i in range(1, len(x) + 1):
+         for j in range(1, len(y) + 1):
+             left = matrix[i, j - 1] + score.gap
+             up = matrix[i - 1, j] + score.gap
+             diag = matrix[i - 1, j - 1] + score.mis_match_char(x[i - 1], y[j - 1])
+             matrix[i, j] = max(left, up, diag)
+             if matrix[i, j] == left:
+                 trace_back[i, j] = 1
+             elif matrix[i, j] == up:
+                 trace_back[i, j] = 2
+             else:
+                 trace_back[i, j] = 3
+     return matrix, trace_back
+ 
+ 
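Note that the second, NumPy-based get_matrix shadows the earlier list-based version, so global_align runs Needleman-Wunsch on NumPy matrices. A tiny self-contained run on made-up token ids (illustrative only):

    x = [101, 7, 8, 102]        # e.g. BOS, "cat", "car", EOS (ids are made up)
    y = [101, 7, 9, 8, 102]     # same sequence with one extra token inserted
    score = ScoreParams(gap=0, match=1, mismatch=-1)
    matrix, trace_back = global_align(x, y, score)
    print(matrix[-1, -1])       # 4: four matched positions; the inserted token costs nothing since gap=0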
+ def get_aligned_sequences(x, y, trace_back):
+     x_seq = []
+     y_seq = []
+     i = len(x)
+     j = len(y)
+     mapper_y_to_x = []
+     while i > 0 or j > 0:
+         if trace_back[i, j] == 3:
+             x_seq.append(x[i - 1])
+             y_seq.append(y[j - 1])
+             i = i - 1
+             j = j - 1
+             mapper_y_to_x.append((j, i))
+         elif trace_back[i][j] == 1:
+             x_seq.append('-')
+             y_seq.append(y[j - 1])
+             j = j - 1
+             mapper_y_to_x.append((j, -1))
+         elif trace_back[i][j] == 2:
+             x_seq.append(x[i - 1])
+             y_seq.append('-')
+             i = i - 1
+         elif trace_back[i][j] == 4:
+             break
+     mapper_y_to_x.reverse()
+     return x_seq, y_seq, torch.tensor(mapper_y_to_x, dtype=torch.int64)
+ 
+ 
+ def get_mapper(x: str, y: str, tokenizer, max_len=77):
+     x_seq = tokenizer.encode(x)
+     y_seq = tokenizer.encode(y)
+     score = ScoreParams(0, 1, -1)
+     matrix, trace_back = global_align(x_seq, y_seq, score)
+     mapper_base = get_aligned_sequences(x_seq, y_seq, trace_back)[-1]
+     alphas = torch.ones(max_len)
+     alphas[: mapper_base.shape[0]] = mapper_base[:, 1].ne(-1).float()
+     mapper = torch.zeros(max_len, dtype=torch.int64)
+     mapper[:mapper_base.shape[0]] = mapper_base[:, 1]
+     mapper[mapper_base.shape[0]:] = len(y_seq) + torch.arange(max_len - len(y_seq))
+     return mapper, alphas
+ 
+ 
+ def get_refinement_mapper(prompts, tokenizer, max_len=77):
+     x_seq = prompts[0]
+     mappers, alphas = [], []
+     for i in range(1, len(prompts)):
+         mapper, alpha = get_mapper(x_seq, prompts[i], tokenizer, max_len)
+         mappers.append(mapper)
+         alphas.append(alpha)
+     return torch.stack(mappers), torch.stack(alphas)
+ 
+ 
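A usage sketch (illustrative only) of the refinement mapper that AttentionRefine consumes; the tokenizer checkpoint is an assumption:

    from transformers import CLIPTokenizer
    tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    prompts = ["a cat sitting on a car", "a fluffy cat sitting on a car"]   # word added -> refinement edit
    mapper, alphas = get_refinement_mapper(prompts, tok)
    print(mapper.shape, alphas.shape)   # torch.Size([1, 77]) torch.Size([1, 77])
    # alphas is 0 at tokens that exist only in the edited prompt ("fluffy"), 1 where the base prompt aligns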
+ def get_word_inds(text: str, word_place: int, tokenizer):
+     split_text = text.split(" ")
+     if type(word_place) is str:
+         word_place = [i for i, word in enumerate(split_text) if word_place == word]
+     elif type(word_place) is int:
+         word_place = [word_place]
+     out = []
+     if len(word_place) > 0:
+         words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
+         cur_len, ptr = 0, 0
+ 
+         for i in range(len(words_encode)):
+             cur_len += len(words_encode[i])
+             if ptr in word_place:
+                 out.append(i + 1)
+             if cur_len >= len(split_text[ptr]):
+                 ptr += 1
+                 cur_len = 0
+     return np.array(out)
+ 
+ 
+ def get_replacement_mapper_(x: str, y: str, tokenizer, max_len=77):
+     words_x = x.split(' ')
+     words_y = y.split(' ')
+     if len(words_x) != len(words_y):
+         raise ValueError(f"attention replacement edit can only be applied on prompts with the same length"
+                          f" but prompt A has {len(words_x)} words and prompt B has {len(words_y)} words.")
+     inds_replace = [i for i in range(len(words_y)) if words_y[i] != words_x[i]]
+     inds_source = [get_word_inds(x, i, tokenizer) for i in inds_replace]
+     inds_target = [get_word_inds(y, i, tokenizer) for i in inds_replace]
+     mapper = np.zeros((max_len, max_len))
+     i = j = 0
+     cur_inds = 0
+     while i < max_len and j < max_len:
+         if cur_inds < len(inds_source) and inds_source[cur_inds][0] == i:
+             inds_source_, inds_target_ = inds_source[cur_inds], inds_target[cur_inds]
+             if len(inds_source_) == len(inds_target_):
+                 mapper[inds_source_, inds_target_] = 1
+             else:
+                 ratio = 1 / len(inds_target_)
+                 for i_t in inds_target_:
+                     mapper[inds_source_, i_t] = ratio
+             cur_inds += 1
+             i += len(inds_source_)
+             j += len(inds_target_)
+         elif cur_inds < len(inds_source):
+             mapper[i, j] = 1
+             i += 1
+             j += 1
+         else:
+             mapper[j, j] = 1
+             i += 1
+             j += 1
+ 
+     return torch.from_numpy(mapper).float()
+ 
+ 
+ def get_replacement_mapper(prompts, tokenizer, max_len=77):
+     x_seq = prompts[0]
+     mappers = []
+     for i in range(1, len(prompts)):
+         mapper = get_replacement_mapper_(x_seq, prompts[i], tokenizer, max_len)
+         mappers.append(mapper)
+     return torch.stack(mappers)
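A usage sketch (illustrative only) of the replacement mapper that AttentionReplace consumes, again assuming a CLIP tokenizer:

    from transformers import CLIPTokenizer
    tok = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    prompts = ["a cat sitting on a car", "a tiger sitting on a car"]   # same word count, one word swapped
    mapper = get_replacement_mapper(prompts, tok)
    print(mapper.shape)   # torch.Size([1, 77, 77]): near-identity, with mass moved from "cat" to "tiger"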