multimodalart and Linoy Tsaban committed
Commit 345d7b4
1 Parent(s): 45e73ca

Update pipeline_semantic_stable_diffusion_img2img_solver.py (#9)


- Update pipeline_semantic_stable_diffusion_img2img_solver.py (4065064f7aab311c2f9705f66c0b0aa7669cfdac)
- Update app.py (24b22ad95d5e32ad4374733cf2b0acb5c0e13f26)


Co-authored-by: Linoy Tsaban <LinoyTsaban@users.noreply.huggingface.co>

app.py CHANGED
@@ -35,9 +35,9 @@ def caption_image(input_image):
     generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
     return generated_caption, generated_caption
 
-def sample(zs, wts, attention_store, prompt_tar="", cfg_scale_tar=15, skip=36, eta=1):
+def sample(zs, wts, attention_store, text_cross_attention_maps, prompt_tar="", cfg_scale_tar=15, skip=36, eta=1):
     latents = wts[-1].expand(1, -1, -1, -1)
-    img, attention_store = pipe(
+    img, attention_store, text_cross_attention_maps = pipe(
         prompt=prompt_tar,
         init_latents=latents,
         guidance_scale=cfg_scale_tar,
@@ -45,10 +45,10 @@ def sample(zs, wts, attention_store, prompt_tar="", cfg_scale_tar=15, skip=36, e
         # num_inference_steps=steps,
         # use_ddpm=True,
         # wts=wts.value,
-        attention_store = attention_store,
+        attention_store = attention_store, text_cross_attention_maps=text_cross_attention_maps,
         zs=zs,
     )
-    return img.images[0], attention_store
+    return img.images[0], attention_store, text_cross_attention_maps
 
 
 def reconstruct(
@@ -59,6 +59,7 @@ def reconstruct(
     wts,
     zs,
     attention_store,
+    text_cross_attention_maps,
     do_reconstruction,
     reconstruction,
     reconstruct_button,
@@ -79,8 +80,8 @@ def reconstruct(
            ): # if image caption was not changed, run actual reconstruction
                tar_prompt = ""
            latents = wts[-1].expand(1, -1, -1, -1)
-           reconstruction, attention_store = sample(
-               zs, wts, attention_store=attention_store, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale
+           reconstruction, attention_store, text_cross_attention_maps = sample(
+               zs, wts, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps,prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale
            )
            do_reconstruction = False
        return (
@@ -130,7 +131,7 @@ def load_and_invert(
 ## SEGA ##
 
 def edit(input_image,
-         wts, zs, attention_store,
+         wts, zs, attention_store, text_cross_attention_maps,
          tar_prompt,
          image_caption,
          steps,
@@ -197,27 +198,27 @@ def edit(input_image,
        )
 
        latnets = wts[-1].expand(1, -1, -1, -1)
-       sega_out, attention_store = pipe(prompt=tar_prompt,
+       sega_out, attention_store, text_cross_attention_maps = pipe(prompt=tar_prompt,
                            init_latents=latnets,
                            guidance_scale = tar_cfg_scale,
                            # num_images_per_prompt=1,
                            # num_inference_steps=steps,
                            # use_ddpm=True,
                            # wts=wts.value,
-                           zs=zs, attention_store=attention_store, **editing_args)
+                           zs=zs, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, **editing_args)
 
-       return sega_out.images[0], gr.update(visible=True), do_reconstruction, reconstruction, wts, zs, attention_store, do_inversion, show_share_button
+       return sega_out.images[0], gr.update(visible=True), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
 
 
    else: # if sega concepts were not added, performs regular ddpm sampling
 
        if do_reconstruction: # if ddpm sampling wasn't computed
-           pure_ddpm_img, attention_store = sample(zs, wts, attention_store=attention_store, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale)
+           pure_ddpm_img, attention_store, text_cross_attention_maps = sample(zs, wts, attention_store=attention_store, text_cross_attention_maps=text_cross_attention_maps, prompt_tar=tar_prompt, skip=skip, cfg_scale_tar=tar_cfg_scale)
            reconstruction = pure_ddpm_img
            do_reconstruction = False
-           return pure_ddpm_img, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, do_inversion, show_share_button
+           return pure_ddpm_img, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
 
-       return reconstruction, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, do_inversion, show_share_button
+       return reconstruction, gr.update(visible=False), do_reconstruction, reconstruction, wts, zs, attention_store, text_cross_attention_maps, do_inversion, show_share_button
 
 
 def randomize_seed_fn(seed, is_random):
@@ -461,6 +462,7 @@ with gr.Blocks(css="style.css") as demo:
    wts = gr.State()
    zs = gr.State()
    attention_store=gr.State()
+   text_cross_attention_maps = gr.State()
    reconstruction = gr.State()
    do_inversion = gr.State(value=True)
    do_reconstruction = gr.State(value=True)
@@ -697,6 +699,7 @@ with gr.Blocks(css="style.css") as demo:
        fn=edit,
        inputs=[input_image,
                wts, zs, attention_store,
+               text_cross_attention_maps,
                tar_prompt,
                image_caption,
                steps,
@@ -716,7 +719,7 @@ with gr.Blocks(css="style.css") as demo:
 
 
                ],
-       outputs=[sega_edited_image, reconstruct_button, do_reconstruction, reconstruction, wts, zs,attention_store, do_inversion, share_btn_container])
+       outputs=[sega_edited_image, reconstruct_button, do_reconstruction, reconstruction, wts, zs,attention_store, text_cross_attention_maps, do_inversion, share_btn_container])
    # .success(fn=update_gallery_display, inputs= [prev_output_image, sega_edited_image], outputs = [gallery, gallery, prev_output_image])
 
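Taken together, the app.py side of this commit stops relying on the pipeline stashing the prompt list on `self` and instead threads `text_cross_attention_maps` through Gradio state next to `attention_store`: both start as empty `gr.State()` values, are passed into every `sample()`/`edit()` call, and come back as extra return values. Below is a minimal sketch of the resulting call pattern for the updated `sample()` helper; it is illustrative only, the prompt string is hypothetical, and `pipe`, `wts`, `zs` are assumed to come from the Space's DDPM inversion step.

# Illustrative sketch, not part of the commit.
# `pipe`, `wts`, `zs` are assumed to come from the Space's DDPM inversion step.
attention_store = None            # starts empty, like the gr.State() in app.py
text_cross_attention_maps = None  # now returned explicitly instead of living on the pipeline object

img, attention_store, text_cross_attention_maps = sample(
    zs,
    wts,
    attention_store=attention_store,
    text_cross_attention_maps=text_cross_attention_maps,
    prompt_tar="a photo of a cat",  # hypothetical target prompt
    cfg_scale_tar=15,
    skip=36,
)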
pipeline_semantic_stable_diffusion_img2img_solver.py CHANGED
@@ -500,6 +500,7 @@ class SemanticStableDiffusionImg2ImgPipeline_DPMSolver(DiffusionPipeline):
         use_cross_attn_mask: bool = False,
         # Attention store (just for visualization purposes)
         attention_store = None,
+        text_cross_attention_maps = None,
         attn_store_steps: Optional[List[int]] = [],
         store_averaged_over_steps: bool = True,
         use_intersect_mask: bool = False,
@@ -755,10 +756,10 @@ class SemanticStableDiffusionImg2ImgPipeline_DPMSolver(DiffusionPipeline):
         # For classifier free guidance, we need to do two forward passes.
         # Here we concatenate the unconditional and text embeddings into a single batch
         # to avoid doing two forward passes
-        self.text_cross_attention_maps = [org_prompt] if isinstance(org_prompt, str) else org_prompt
+        text_cross_attention_maps = [org_prompt] if isinstance(org_prompt, str) else org_prompt
         if enable_edit_guidance:
             text_embeddings = torch.cat([uncond_embeddings, text_embeddings, edit_concepts])
-            self.text_cross_attention_maps += \
+            text_cross_attention_maps += \
                 ([editing_prompt] if isinstance(editing_prompt, str) else editing_prompt)
         else:
             text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
@@ -920,11 +921,11 @@ class SemanticStableDiffusionImg2ImgPipeline_DPMSolver(DiffusionPipeline):
                    if use_cross_attn_mask:
                        out = attention_store.aggregate_attention(
                            attention_maps=attention_store.step_store,
-                           prompts=self.text_cross_attention_maps,
+                           prompts=text_cross_attention_maps,
                            res=16,
                            from_where=["up", "down"],
                            is_cross=True,
-                           select=self.text_cross_attention_maps.index(editing_prompt[c]),
+                           select=text_cross_attention_maps.index(editing_prompt[c]),
                        )
                        attn_map = out[:, :, :, 1:1 + num_edit_tokens[c]] # 0 -> startoftext
 
@@ -1105,7 +1106,7 @@ class SemanticStableDiffusionImg2ImgPipeline_DPMSolver(DiffusionPipeline):
        if not return_dict:
            return (image, has_nsfw_concept), attention_store
 
-       return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept), attention_store
+       return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept), attention_store, text_cross_attention_maps
 
    def encode_text(self, prompts):
        text_inputs = self.tokenizer(
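On the pipeline side, the net effect is that `__call__` now takes `text_cross_attention_maps` as an argument and, in the default `return_dict=True` path shown in the last hunk, returns it as a third value after the output object and the attention store. A minimal usage sketch follows, assuming an already loaded `SemanticStableDiffusionImg2ImgPipeline_DPMSolver` bound to `pipe`, with `latents` and `zs` coming from an earlier inversion step; the prompt is hypothetical.

# Illustrative sketch, not part of the commit.
# `pipe`, `latents`, `zs` are assumed to exist from an earlier inversion step.
out, attention_store, text_cross_attention_maps = pipe(
    prompt="a photo of a cat",       # hypothetical target prompt
    init_latents=latents,
    guidance_scale=15,
    attention_store=None,            # may start as None, as it does in app.py
    text_cross_attention_maps=None,  # likewise; returned populated by the call
    zs=zs,
)
edited_image = out.images[0]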