Shanghua Gao committed
Commit 78acc58
1 Parent(s): 5fb4baa
README.md CHANGED
@@ -8,7 +8,6 @@ sdk_version: 3.35.2
 app_file: app.py
 pinned: false
 ---
-
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 # Edit Anything by Segment-Anything
annotator/util.py CHANGED
@@ -1,7 +1,7 @@
 import numpy as np
 import cv2
 import os
-
+import pickle

 annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')

@@ -71,3 +71,25 @@ def get_bounding_box(mask):

     # Return as [xmin, ymin, xmax, ymax]
     return [rmin, cmin, rmax, cmax]
+
+
+
+def save_input_to_file(func):
+    def wrapper(self, *args, **kwargs):
+        # Make a copy of the inputs that does not include self
+        input_data = {
+            'args': args,
+            'kwargs': kwargs
+        }
+
+        # Run the original function
+        result = func(self, *args, **kwargs)
+
+        # Save the input data to a file
+        with open('input_data.pkl', 'wb') as f:
+            pickle.dump(input_data, f)
+
+        # Return the result
+        return result
+
+    return wrapper
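The new `save_input_to_file` decorator pickles a method's positional and keyword arguments to `input_data.pkl` so a call can later be replayed without the Gradio UI (see `editany_nogradio.py` below). A minimal usage sketch, assuming `annotator.util` is importable; the `Example` class and its `run` method are hypothetical:

```python
import pickle

from annotator.util import save_input_to_file


class Example:
    @save_input_to_file  # records args/kwargs of every call to input_data.pkl
    def run(self, image_path, prompt="a cat"):
        return f"{prompt}: {image_path}"


Example().run("img.png", prompt="a dog")

# Replay the captured call later, outside the UI:
with open("input_data.pkl", "rb") as f:
    saved = pickle.load(f)
print(saved["args"], saved["kwargs"])
```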
app.py CHANGED
@@ -68,4 +68,4 @@ with gr.Blocks() as demo:
     with gr.Tabs():
         gr.Markdown(SHARED_UI_WARNING)
 
-demo.queue(api_open=False).launch(server_name='0.0.0.0', share=False)
+demo.queue(api_open=False).launch(server_name='0.0.0.0', share=False)
editany_demo.py CHANGED
@@ -1,6 +1,10 @@
1
  # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
  import gradio as gr
3
4
 
5
  def create_demo_template(
6
  process,
@@ -22,7 +26,7 @@ def create_demo_template(
22
  ref_click_mask = gr.State(None)
23
  with gr.Row():
24
  gr.Markdown(INFO)
25
- with gr.Row().style(equal_height=False):
26
  with gr.Column():
27
  with gr.Tab("Click🖱"):
28
  source_image_click = gr.Image(
@@ -40,12 +44,13 @@ def create_demo_template(
40
  interactive=True,
41
  show_label=False,
42
  )
43
- clear_button_click = gr.Button(
44
- value="Clear Click Points", interactive=True
45
- )
46
- clear_button_image = gr.Button(
47
- value="Clear Image", interactive=True
48
- )
 
49
  with gr.Row():
50
  run_button_click = gr.Button(
51
  label="Run EditAnying", interactive=True
@@ -56,63 +61,75 @@ def create_demo_template(
56
  label="Image: Upload an image and cover the region you want to edit with sketch",
57
  type="numpy",
58
  tool="sketch",
 
59
  )
60
  run_button = gr.Button(
61
  label="Run EditAnying", interactive=True)
62
- with gr.Column():
63
- enable_all_generate = gr.Checkbox(
64
- label="Auto generation on all region.", value=False
 
 
65
  )
66
  control_scale = gr.Slider(
67
- label="Mask Align strength",
68
- info="Large value -> strict alignment with SAM mask",
69
  minimum=0,
70
  maximum=1,
71
  value=0.5,
72
  step=0.1,
73
  )
74
  with gr.Column():
75
- enable_auto_prompt = gr.Checkbox(
76
- label="Auto generate text prompt from input image with BLIP2",
77
- info="Warning: Enable this may makes your prompt not working.",
78
- value=enable_auto_prompt_default,
79
- )
80
- a_prompt = gr.Textbox(
81
- label="Positive Prompt",
82
- info="Text in the expected things of edited region",
83
- value="best quality, extremely detailed,",
84
- )
85
- n_prompt = gr.Textbox(
86
- label="Negative Prompt",
87
- value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, NSFW",
88
- )
89
- with gr.Row():
90
- num_samples = gr.Slider(
91
- label="Images", minimum=1, maximum=12, value=2, step=1
92
- )
93
- seed = gr.Slider(
94
- label="Seed",
95
- minimum=-1,
96
- maximum=2147483647,
97
- step=1,
98
- randomize=True,
99
- )
100
  with gr.Row():
101
  enable_tile = gr.Checkbox(
102
- label="Tile refinement for high resolution generation",
103
  info="Slow inference",
104
  value=True,
105
  )
106
  refine_alignment_ratio = gr.Slider(
107
- label="Alignment Strength",
108
- info="Large value -> strict alignment with input image. Small value -> strong global consistency",
109
  minimum=0.0,
110
  maximum=1.0,
111
  value=0.95,
112
  step=0.05,
113
  )
114
 
115
- with gr.Accordion("Reference options", open=False):
116
  # ref_image = gr.Image(
117
  # source='upload', label="Upload a reference image", type="pil", value=None)
118
  ref_image = gr.Image(
@@ -120,8 +137,9 @@ def create_demo_template(
120
  label="Upload a reference image and cover the region you want to use with sketch",
121
  type="pil",
122
  tool="sketch",
 
123
  )
124
- with gr.Column():
125
  ref_auto_prompt = gr.Checkbox(
126
  label="Ref. Auto Prompt", value=True
127
  )
@@ -148,45 +166,25 @@ def create_demo_template(
148
  with gr.Row():
149
  reference_attn = gr.Checkbox(
150
  label="reference_attn", value=True)
151
- attention_auto_machine_weight = gr.Slider(
152
- label="attention_weight",
153
- minimum=0,
154
- maximum=1.0,
155
- value=0.8,
156
- step=0.01,
157
  )
158
  with gr.Row():
159
- reference_adain = gr.Checkbox(
160
- label="reference_adain", value=False
 
 
 
 
161
  )
162
- gn_auto_machine_weight = gr.Slider(
163
- label="gn_weight",
164
  minimum=0,
165
  maximum=1.0,
166
- value=0.1,
167
- step=0.01,
168
  )
169
- style_fidelity = gr.Slider(
170
- label="Style fidelity",
171
- minimum=0,
172
- maximum=1.0,
173
- value=0.5,
174
- step=0.01,
175
- )
176
- ref_sam_scale = gr.Slider(
177
- label="SAM Control Scale",
178
- minimum=0,
179
- maximum=1.0,
180
- value=0.3,
181
- step=0.1,
182
- )
183
- ref_inpaint_scale = gr.Slider(
184
- label="Inpaint Control Scale",
185
- minimum=0,
186
- maximum=1.0,
187
- value=0.2,
188
- step=0.1,
189
- )
190
  with gr.Row():
191
  ref_textinv = gr.Checkbox(
192
  label="Use textual inversion token", value=False
@@ -196,8 +194,37 @@ def create_demo_template(
196
  info="Text in the inversion token path",
197
  value=None,
198
  )
199
 
200
- with gr.Accordion("Advanced options", open=False):
201
  mask_image = gr.Image(
202
  source="upload",
203
  label="Upload a predefined mask of edit region: Switch to Brush mode when using this!",
@@ -244,19 +271,16 @@ def create_demo_template(
244
  )
245
  with gr.Column():
246
  result_gallery_refine = gr.Gallery(
247
- label="Output High quality", show_label=True, elem_id="gallery"
248
- ).style(grid=2, preview=False)
249
  result_gallery_init = gr.Gallery(
250
- label="Output Low quality", show_label=True, elem_id="gallery"
251
- ).style(grid=2, height="auto")
252
  result_gallery_ref = gr.Gallery(
253
- label="Output Ref", show_label=False, elem_id="gallery"
254
- ).style(grid=2, height="auto")
255
- result_text = gr.Text(label="BLIP2+Human Prompt Text")
256
 
257
  ips = [
258
  source_image_brush,
259
- enable_all_generate,
260
  mask_image,
261
  control_scale,
262
  enable_auto_prompt,
@@ -288,6 +312,7 @@ def create_demo_template(
288
  ref_auto_prompt,
289
  ref_textinv,
290
  ref_textinv_path,
 
291
  ]
292
  run_button.click(
293
  fn=process,
@@ -299,10 +324,56 @@ def create_demo_template(
299
  result_text,
300
  ],
301
  )
302
 
303
  ip_click = [
304
  origin_image,
305
- enable_all_generate,
306
  click_mask,
307
  control_scale,
308
  enable_auto_prompt,
@@ -334,6 +405,7 @@ def create_demo_template(
334
  ref_auto_prompt,
335
  ref_textinv,
336
  ref_textinv_path,
 
337
  ]
338
 
339
  run_button_click.click(
 
1
  # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
  import gradio as gr
3
 
4
+ import numpy as np
5
+ import cv2
6
+ from cv2 import imencode
7
+ import base64
8
 
9
  def create_demo_template(
10
  process,
 
26
  ref_click_mask = gr.State(None)
27
  with gr.Row():
28
  gr.Markdown(INFO)
29
+ with gr.Row(equal_height=False):
30
  with gr.Column():
31
  with gr.Tab("Click🖱"):
32
  source_image_click = gr.Image(
 
44
  interactive=True,
45
  show_label=False,
46
  )
47
+ with gr.Row():
48
+ clear_button_click = gr.Button(
49
+ value="Clear Points", interactive=True
50
+ )
51
+ clear_button_image = gr.Button(
52
+ value="Reset Image", interactive=True
53
+ )
54
  with gr.Row():
55
  run_button_click = gr.Button(
56
  label="Run EditAnying", interactive=True
 
61
  label="Image: Upload an image and cover the region you want to edit with sketch",
62
  type="numpy",
63
  tool="sketch",
64
+ brush_color="#00FFBF"
65
  )
66
  run_button = gr.Button(
67
  label="Run EditAnying", interactive=True)
68
+ with gr.Tab("All region"):
69
+ source_image_clean = gr.Image(
70
+ source="upload",
71
+ label="Image: Upload an image",
72
+ type="numpy",
73
  )
74
+ run_button_allregion = gr.Button(
75
+ label="Run EditAnying", interactive=True)
76
+ with gr.Row():
77
+ # enable_all_generate = gr.Checkbox(
78
+ # label="All Region Generation", value=False
79
+ # )
80
  control_scale = gr.Slider(
81
+ label="SAM Mask Alignment Strength",
82
+ # info="Large value -> strict alignment with SAM mask",
83
  minimum=0,
84
  maximum=1,
85
  value=0.5,
86
  step=0.1,
87
  )
88
+ with gr.Row():
89
+ num_samples = gr.Slider(
90
+ label="Images", minimum=1, maximum=12, value=2, step=1
91
+ )
92
+ seed = gr.Slider(
93
+ label="Seed",
94
+ minimum=-1,
95
+ maximum=2147483647,
96
+ step=1,
97
+ randomize=True,
98
+ )
99
  with gr.Column():
100
+ with gr.Row():
101
+ enable_auto_prompt = gr.Checkbox(
102
+ label="Prompt Auto Generation (Enable this may makes your prompt not working)",
103
+ # info="",
104
+ value=enable_auto_prompt_default,
105
+ )
106
+ with gr.Row():
107
+ a_prompt = gr.Textbox(
108
+ label="Positive Prompt",
109
+ info="Text in the expected things of edited region",
110
+ value="best quality, extremely detailed,",
111
+ )
112
+ n_prompt = gr.Textbox(
113
+ label="Negative Prompt",
114
+ value="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, NSFW",
115
+ )
116
+
117
  with gr.Row():
118
  enable_tile = gr.Checkbox(
119
+ label="High-resolution Refinement",
120
  info="Slow inference",
121
  value=True,
122
  )
123
  refine_alignment_ratio = gr.Slider(
124
+ label="Similarity with Initial Results",
125
+ # info="Large value -> strict alignment with input image. Small value -> strong global consistency",
126
  minimum=0.0,
127
  maximum=1.0,
128
  value=0.95,
129
  step=0.05,
130
  )
131
 
132
+ with gr.Accordion("Cross-image Drag Options", open=False):
133
  # ref_image = gr.Image(
134
  # source='upload', label="Upload a reference image", type="pil", value=None)
135
  ref_image = gr.Image(
 
137
  label="Upload a reference image and cover the region you want to use with sketch",
138
  type="pil",
139
  tool="sketch",
140
+ brush_color="#00FFBF",
141
  )
142
+ with gr.Row():
143
  ref_auto_prompt = gr.Checkbox(
144
  label="Ref. Auto Prompt", value=True
145
  )
 
166
  with gr.Row():
167
  reference_attn = gr.Checkbox(
168
  label="reference_attn", value=True)
169
+ reference_adain = gr.Checkbox(
170
+ label="reference_adain", value=True
 
 
 
 
171
  )
172
  with gr.Row():
173
+ ref_sam_scale = gr.Slider(
174
+ label="Pos Control Scale",
175
+ minimum=0,
176
+ maximum=1.0,
177
+ value=0.3,
178
+ step=0.1,
179
  )
180
+ ref_inpaint_scale = gr.Slider(
181
+ label="Content Control Scale",
182
  minimum=0,
183
  maximum=1.0,
184
+ value=0.2,
185
+ step=0.1,
186
  )
187
+
188
  with gr.Row():
189
  ref_textinv = gr.Checkbox(
190
  label="Use textual inversion token", value=False
 
194
  info="Text in the inversion token path",
195
  value=None,
196
  )
197
+ with gr.Accordion("Advanced options", open=False):
198
+ style_fidelity = gr.Slider(
199
+ label="Style fidelity",
200
+ minimum=0,
201
+ maximum=1.,
202
+ value=0.,
203
+ step=0.1,
204
+ )
205
+ attention_auto_machine_weight = gr.Slider(
206
+ label="Attention Reference Weight",
207
+ minimum=0,
208
+ maximum=1.0,
209
+ value=1.0,
210
+ step=0.01,
211
+ )
212
+ gn_auto_machine_weight = gr.Slider(
213
+ label="GroupNorm Reference Weight",
214
+ minimum=0,
215
+ maximum=1.0,
216
+ value=1.0,
217
+ step=0.01,
218
+ )
219
+ ref_scale = gr.Slider(
220
+ label="Frequency Reference Guidance Scale",
221
+ minimum=0,
222
+ maximum=1.0,
223
+ value=0.0,
224
+ step=0.1,
225
+ )
226
 
227
+ with gr.Accordion("Advanced Options", open=False):
228
  mask_image = gr.Image(
229
  source="upload",
230
  label="Upload a predefined mask of edit region: Switch to Brush mode when using this!",
 
271
  )
272
  with gr.Column():
273
  result_gallery_refine = gr.Gallery(
274
+ label="Output High quality", show_label=True, elem_id="gallery", preview=False)
 
275
  result_gallery_init = gr.Gallery(
276
+ label="Output Low quality", show_label=True, elem_id="gallery", height="auto")
 
277
  result_gallery_ref = gr.Gallery(
278
+ label="Output Ref", show_label=False, elem_id="gallery", height="auto")
279
+ result_text = gr.Text(label="ALL Prompt Text")
 
280
 
281
  ips = [
282
  source_image_brush,
283
+ gr.State(False), # enable_all_generate
284
  mask_image,
285
  control_scale,
286
  enable_auto_prompt,
 
312
  ref_auto_prompt,
313
  ref_textinv,
314
  ref_textinv_path,
315
+ ref_scale,
316
  ]
317
  run_button.click(
318
  fn=process,
 
324
  result_text,
325
  ],
326
  )
327
+ ips_allregion = [
328
+ source_image_clean,
329
+ gr.State(True), # enable_all_generate
330
+ mask_image,
331
+ control_scale,
332
+ enable_auto_prompt,
333
+ a_prompt,
334
+ n_prompt,
335
+ num_samples,
336
+ image_resolution,
337
+ detect_resolution,
338
+ ddim_steps,
339
+ guess_mode,
340
+ scale,
341
+ seed,
342
+ eta,
343
+ enable_tile,
344
+ refine_alignment_ratio,
345
+ refine_image_resolution,
346
+ alpha_weight,
347
+ use_scale_map,
348
+ condition_model,
349
+ ref_image,
350
+ attention_auto_machine_weight,
351
+ gn_auto_machine_weight,
352
+ style_fidelity,
353
+ reference_attn,
354
+ reference_adain,
355
+ ref_prompt,
356
+ ref_sam_scale,
357
+ ref_inpaint_scale,
358
+ ref_auto_prompt,
359
+ ref_textinv,
360
+ ref_textinv_path,
361
+ ref_scale,
362
+ ]
363
+ run_button_allregion.click(
364
+ fn=process,
365
+ inputs=ips_allregion,
366
+ outputs=[
367
+ result_gallery_refine,
368
+ result_gallery_init,
369
+ result_gallery_ref,
370
+ result_text,
371
+ ],
372
+ )
373
 
374
  ip_click = [
375
  origin_image,
376
+ gr.State(False), # enable_all_generate
377
  click_mask,
378
  control_scale,
379
  enable_auto_prompt,
 
405
  ref_auto_prompt,
406
  ref_textinv,
407
  ref_textinv_path,
408
+ ref_scale,
409
  ]
410
 
411
  run_button_click.click(
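The new "All region" tab reuses the same `process` callback and pins `enable_all_generate` by passing `gr.State(False)` / `gr.State(True)` directly in the inputs list, so one handler serves every tab. A small sketch of that Gradio pattern; the `echo` function and the labels here are illustrative, not from the repo:

```python
import gradio as gr

def echo(enable_all_generate, prompt):
    return f"enable_all_generate={enable_all_generate}: {prompt}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    # gr.State(...) acts as a constant input: the callback always receives that value.
    gr.Button("Run on all regions").click(fn=echo, inputs=[gr.State(True), prompt], outputs=result)
    gr.Button("Run on masked region").click(fn=echo, inputs=[gr.State(False), prompt], outputs=result)

# demo.launch()
```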
editany_lora.py CHANGED
@@ -14,7 +14,7 @@ import random
14
  import os
15
  import requests
16
  from io import BytesIO
17
- from annotator.util import resize_image, HWC3, resize_points, get_bounding_box
18
 
19
  import torch
20
  from safetensors.torch import load_file
@@ -28,8 +28,7 @@ from utils.stable_diffusion_controlnet_inpaint import StableDiffusionControlNetI
28
  # need the latest transformers
29
  # pip install git+https://github.com/huggingface/transformers.git
30
  from transformers import AutoProcessor, Blip2ForConditionalGeneration
31
- from diffusers import ControlNetModel, DiffusionPipeline
32
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
33
  import PIL.Image
34
 
35
  # Segment-Anything init.
@@ -119,16 +118,55 @@ def get_pipeline_embeds(pipeline, prompt, negative_prompt, device):
119
  """
120
  max_length = pipeline.tokenizer.model_max_length
121
 
122
- # simple way to determine length of tokens
123
- count_prompt = len(re.split(r", ", prompt))
124
- count_negative_prompt = len(re.split(r", ", negative_prompt))
125
-
126
- # create the tensor based on which prompt is longer
127
- if count_prompt >= count_negative_prompt:
128
- input_ids = pipeline.tokenizer(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  prompt, return_tensors="pt", truncation=False
130
  ).input_ids.to(device)
131
- shape_max_length = input_ids.shape[-1]
132
  negative_ids = pipeline.tokenizer(
133
  negative_prompt,
134
  truncation=False,
@@ -137,23 +175,21 @@ def get_pipeline_embeds(pipeline, prompt, negative_prompt, device):
137
  return_tensors="pt",
138
  ).input_ids.to(device)
139
  else:
140
- negative_ids = pipeline.tokenizer(
141
- negative_prompt, return_tensors="pt", truncation=False
142
- ).input_ids.to(device)
143
- shape_max_length = negative_ids.shape[-1]
144
  input_ids = pipeline.tokenizer(
145
- prompt,
146
- return_tensors="pt",
147
- truncation=False,
148
- padding="max_length",
149
- max_length=shape_max_length,
150
- ).input_ids.to(device)
151
 
152
  concat_embeds = []
153
  neg_embeds = []
154
  for i in range(0, shape_max_length, max_length):
155
- concat_embeds.append(pipeline.text_encoder(input_ids[:, i : i + max_length])[0])
156
- neg_embeds.append(pipeline.text_encoder(negative_ids[:, i : i + max_length])[0])
 
 
157
 
158
  return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)
159
 
@@ -178,10 +214,12 @@ def load_lora_weights(pipeline, checkpoint_path, multiplier, device, dtype):
178
  for layer, elems in updates.items():
179
 
180
  if "text" in layer:
181
- layer_infos = layer.split(LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
 
182
  curr_layer = pipeline.text_encoder
183
  else:
184
- layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_")
 
185
  curr_layer = pipeline.unet
186
 
187
  # find the target layer
@@ -244,7 +282,8 @@ def load_lora_weights(pipeline, checkpoint_path, multiplier, device, dtype):
244
  )
245
  curr_layer = pipeline.text_encoder
246
  else:
247
- layer_infos = layer.split(LORA_PREFIX_UNET + "_")[-1].split("_")
 
248
  curr_layer = pipeline.unet
249
 
250
  # find the target layer
@@ -489,7 +528,7 @@ class EditAnythingLoraModel:
489
  self.mask_predictor.set_image(image)
490
  # Separate the points and labels
491
  points, labels = zip(*[(point[:2], point[2])
492
- for point in clicked_points])
493
 
494
  # Convert the points and labels to numpy arrays
495
  input_point = np.array(points)
@@ -534,7 +573,8 @@ class EditAnythingLoraModel:
534
  mask_click_np = np.transpose(mask_click_np, (1, 2, 0)) * 255.0
535
 
536
  mask_image = HWC3(mask_click_np.astype(np.uint8))
537
- mask_image = cv2.resize(mask_image, (W, H), interpolation=cv2.INTER_LINEAR)
 
538
  # mask_image = Image.fromarray(mask_image_tmp)
539
 
540
  # Draw circles for all clicked points
@@ -567,6 +607,7 @@ class EditAnythingLoraModel:
567
  )
568
 
569
  @torch.inference_mode()
 
570
  def process(
571
  self,
572
  source_image,
@@ -602,6 +643,7 @@ class EditAnythingLoraModel:
602
  ref_auto_prompt=False,
603
  ref_textinv=True,
604
  ref_textinv_path=None,
 
605
  ):
606
 
607
  if condition_model is None or condition_model == "EditAnything":
@@ -624,14 +666,9 @@ class EditAnythingLoraModel:
624
  )
625
  self.defalut_enable_all_generate = enable_all_generate
626
  if enable_all_generate:
627
- print(
628
- "source_image",
629
- source_image["mask"].shape,
630
- input_image.shape,
631
- )
632
  mask_image = (
633
  np.ones((input_image.shape[0],
634
- input_image.shape[1], 3)) * 255
635
  )
636
  else:
637
  mask_image = source_image["mask"]
@@ -699,11 +736,13 @@ class EditAnythingLoraModel:
699
  except:
700
  print("No textinvert embeddings found.")
701
  ref_data_path = "./utils/tmp/textinv/img"
702
- if not os.path.exists(ref_data_path):
703
  os.makedirs(ref_data_path)
704
- cropped_ref_image.save(os.path.join(ref_data_path, 'ref.png'))
 
705
  print("Ref image region is save to:", ref_data_path)
706
- print("Plese finetune with run_texutal_inversion.sh in utils folder to get the textinvert embeddings.")
 
707
 
708
  else:
709
  ref_mask = None
@@ -735,7 +774,7 @@ class EditAnythingLoraModel:
735
  )
736
 
737
  control = torch.from_numpy(detected_map.copy()).float().cuda()
738
- control = torch.stack([control for _ in range(num_samples)], dim=0)
739
  control = einops.rearrange(control, "b h w c -> b c h w").clone()
740
 
741
  mask_imag_ori = HWC3(mask_image.astype(np.uint8))
@@ -753,14 +792,8 @@ class EditAnythingLoraModel:
753
  prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(
754
  self.pipe, postive_prompt, negative_prompt, "cuda"
755
  )
756
- prompt_embeds = torch.cat([prompt_embeds] * num_samples, dim=0)
757
- negative_prompt_embeds = torch.cat(
758
- [negative_prompt_embeds] * num_samples, dim=0
759
- )
760
 
761
  if enable_all_generate and self.extra_inpaint:
762
- self.pipe.safety_checker = lambda images, clip_input: (
763
- images, False)
764
  if ref_image is not None:
765
  print("Not support yet.")
766
  return
@@ -845,6 +878,7 @@ class EditAnythingLoraModel:
845
  reference_adain=reference_adain,
846
  ref_controlnet_conditioning_scale=ref_multi_condition_scale,
847
  guess_mode=guess_mode,
 
848
  ).images
849
  results = [x_samples[i] for i in range(num_samples)]
850
 
 
14
  import os
15
  import requests
16
  from io import BytesIO
17
+ from annotator.util import resize_image, HWC3, resize_points, get_bounding_box, save_input_to_file
18
 
19
  import torch
20
  from safetensors.torch import load_file
 
28
  # need the latest transformers
29
  # pip install git+https://github.com/huggingface/transformers.git
30
  from transformers import AutoProcessor, Blip2ForConditionalGeneration
31
+ from diffusers import ControlNetModel
 
32
  import PIL.Image
33
 
34
  # Segment-Anything init.
 
118
  """
119
  max_length = pipeline.tokenizer.model_max_length
120
 
121
+ # # simple way to determine length of tokens
122
+ # count_prompt = len(re.split(r",", prompt))
123
+ # count_negative_prompt = len(re.split(r",", negative_prompt))
124
+
125
+ # # create the tensor based on which prompt is longer
126
+ # if count_prompt >= count_negative_prompt:
127
+ # input_ids = pipeline.tokenizer(
128
+ # prompt, return_tensors="pt", truncation=False
129
+ # ).input_ids.to(device)
130
+ # shape_max_length = input_ids.shape[-1]
131
+ # negative_ids = pipeline.tokenizer(
132
+ # negative_prompt,
133
+ # truncation=False,
134
+ # padding="max_length",
135
+ # max_length=shape_max_length,
136
+ # return_tensors="pt",
137
+ # ).input_ids.to(device)
138
+ # else:
139
+ # negative_ids = pipeline.tokenizer(
140
+ # negative_prompt, return_tensors="pt", truncation=False
141
+ # ).input_ids.to(device)
142
+ # shape_max_length = negative_ids.shape[-1]
143
+ # input_ids = pipeline.tokenizer(
144
+ # prompt,
145
+ # return_tensors="pt",
146
+ # truncation=False,
147
+ # padding="max_length",
148
+ # max_length=shape_max_length,
149
+ # ).input_ids.to(device)
150
+
151
+ # concat_embeds = []
152
+ # neg_embeds = []
153
+ # for i in range(0, shape_max_length, max_length):
154
+ # concat_embeds.append(pipeline.text_encoder(
155
+ # input_ids[:, i: i + max_length])[0])
156
+ # neg_embeds.append(pipeline.text_encoder(
157
+ # negative_ids[:, i: i + max_length])[0])
158
+
159
+ input_ids = pipeline.tokenizer(
160
  prompt, return_tensors="pt", truncation=False
161
  ).input_ids.to(device)
162
+
163
+ negative_ids = pipeline.tokenizer(
164
+ negative_prompt, return_tensors="pt", truncation=False
165
+ ).input_ids.to(device)
166
+
167
+ shape_max_length = max(input_ids.shape[-1],negative_ids.shape[-1])
168
+
169
+ if input_ids.shape[-1]>negative_ids.shape[-1]:
170
  negative_ids = pipeline.tokenizer(
171
  negative_prompt,
172
  truncation=False,
 
175
  return_tensors="pt",
176
  ).input_ids.to(device)
177
  else:
178
  input_ids = pipeline.tokenizer(
179
+ prompt,
180
+ return_tensors="pt",
181
+ truncation=False,
182
+ padding="max_length",
183
+ max_length=shape_max_length,
184
+ ).input_ids.to(device)
185
 
186
  concat_embeds = []
187
  neg_embeds = []
188
  for i in range(0, shape_max_length, max_length):
189
+ concat_embeds.append(pipeline.text_encoder(
190
+ input_ids[:, i: i + max_length])[0])
191
+ neg_embeds.append(pipeline.text_encoder(
192
+ negative_ids[:, i: i + max_length])[0])
193
 
194
  return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)
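The rewritten `get_pipeline_embeds` pads the shorter of the two prompts to the longer token length and then encodes both in windows of `tokenizer.model_max_length`, which is how prompts longer than CLIP's 77-token limit are handled. A condensed sketch of the same idea, assuming a Diffusers-style pipeline that exposes `tokenizer` and `text_encoder`:

```python
import torch

def long_prompt_embeds(pipe, prompt, negative_prompt, device="cuda"):
    """Encode arbitrarily long prompts by chunking token ids into
    windows of the tokenizer's model_max_length (77 for CLIP)."""
    max_len = pipe.tokenizer.model_max_length
    ids = pipe.tokenizer(prompt, return_tensors="pt", truncation=False).input_ids.to(device)
    neg_ids = pipe.tokenizer(negative_prompt, return_tensors="pt", truncation=False).input_ids.to(device)

    # Pad the shorter sequence so both share the same total length.
    total = max(ids.shape[-1], neg_ids.shape[-1])
    def pad(text):
        return pipe.tokenizer(text, return_tensors="pt", truncation=False,
                              padding="max_length", max_length=total).input_ids.to(device)
    if ids.shape[-1] >= neg_ids.shape[-1]:
        neg_ids = pad(negative_prompt)
    else:
        ids = pad(prompt)

    # Encode window by window and concatenate along the sequence dimension.
    pos, neg = [], []
    for i in range(0, total, max_len):
        pos.append(pipe.text_encoder(ids[:, i:i + max_len])[0])
        neg.append(pipe.text_encoder(neg_ids[:, i:i + max_len])[0])
    return torch.cat(pos, dim=1), torch.cat(neg, dim=1)
```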
195
 
 
214
  for layer, elems in updates.items():
215
 
216
  if "text" in layer:
217
+ layer_infos = layer.split(
218
+ LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
219
  curr_layer = pipeline.text_encoder
220
  else:
221
+ layer_infos = layer.split(
222
+ LORA_PREFIX_UNET + "_")[-1].split("_")
223
  curr_layer = pipeline.unet
224
 
225
  # find the target layer
 
282
  )
283
  curr_layer = pipeline.text_encoder
284
  else:
285
+ layer_infos = layer.split(
286
+ LORA_PREFIX_UNET + "_")[-1].split("_")
287
  curr_layer = pipeline.unet
288
 
289
  # find the target layer
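Both LoRA-loading branches recover the target module by stripping the `lora_unet_` / `lora_te_` prefix from a weight key and walking the remaining underscore-separated tokens through the pipeline. A rough sketch of that lookup, assuming keys follow the common kohya-ss naming scheme; the example key and helper name are illustrative:

```python
def find_lora_target(pipeline, layer_key,
                     unet_prefix="lora_unet", text_prefix="lora_te"):
    """Walk pipeline.unet / pipeline.text_encoder following the
    underscore-separated tokens of a LoRA weight key."""
    if "text" in layer_key:
        tokens = layer_key.split(text_prefix + "_")[-1].split("_")
        module = pipeline.text_encoder
    else:
        tokens = layer_key.split(unet_prefix + "_")[-1].split("_")
        module = pipeline.unet

    # Tokens may need to be re-joined ("down" + "blocks" -> "down_blocks"),
    # so greedily merge tokens until a child module with that name exists.
    pending = []
    while tokens:
        pending.append(tokens.pop(0))
        name = "_".join(pending)
        if name in dict(module.named_children()):
            module = getattr(module, name)
            pending = []
    return module

# e.g. find_lora_target(pipe, "lora_unet_down_blocks_0_attentions_0_proj_in")
```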
 
528
  self.mask_predictor.set_image(image)
529
  # Separate the points and labels
530
  points, labels = zip(*[(point[:2], point[2])
531
+ for point in clicked_points])
532
 
533
  # Convert the points and labels to numpy arrays
534
  input_point = np.array(points)
 
573
  mask_click_np = np.transpose(mask_click_np, (1, 2, 0)) * 255.0
574
 
575
  mask_image = HWC3(mask_click_np.astype(np.uint8))
576
+ mask_image = cv2.resize(
577
+ mask_image, (W, H), interpolation=cv2.INTER_LINEAR)
578
  # mask_image = Image.fromarray(mask_image_tmp)
579
 
580
  # Draw circles for all clicked points
 
607
  )
608
 
609
  @torch.inference_mode()
610
+ @save_input_to_file # for debug use
611
  def process(
612
  self,
613
  source_image,
 
643
  ref_auto_prompt=False,
644
  ref_textinv=True,
645
  ref_textinv_path=None,
646
+ ref_scale=None,
647
  ):
648
 
649
  if condition_model is None or condition_model == "EditAnything":
 
666
  )
667
  self.defalut_enable_all_generate = enable_all_generate
668
  if enable_all_generate:
669
  mask_image = (
670
  np.ones((input_image.shape[0],
671
+ input_image.shape[1], 3)) * 255
672
  )
673
  else:
674
  mask_image = source_image["mask"]
 
736
  except:
737
  print("No textinvert embeddings found.")
738
  ref_data_path = "./utils/tmp/textinv/img"
739
+ if not os.path.exists(ref_data_path):
740
  os.makedirs(ref_data_path)
741
+ cropped_ref_image.save(
742
+ os.path.join(ref_data_path, 'ref.png'))
743
  print("Ref image region is save to:", ref_data_path)
744
+ print(
745
+ "Plese finetune with run_texutal_inversion.sh in utils folder to get the textinvert embeddings.")
746
 
747
  else:
748
  ref_mask = None
 
774
  )
775
 
776
  control = torch.from_numpy(detected_map.copy()).float().cuda()
777
+ control = control.unsqueeze(dim=0)
778
  control = einops.rearrange(control, "b h w c -> b c h w").clone()
779
 
780
  mask_imag_ori = HWC3(mask_image.astype(np.uint8))
 
792
  prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(
793
  self.pipe, postive_prompt, negative_prompt, "cuda"
794
  )
795
 
796
  if enable_all_generate and self.extra_inpaint:
 
 
797
  if ref_image is not None:
798
  print("Not support yet.")
799
  return
 
878
  reference_adain=reference_adain,
879
  ref_controlnet_conditioning_scale=ref_multi_condition_scale,
880
  guess_mode=guess_mode,
881
+ ref_scale=ref_scale,
882
  ).images
883
  results = [x_samples[i] for i in range(num_samples)]
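Earlier in this file's diff the control map is batched with a single `unsqueeze(dim=0)` instead of stacking `num_samples` copies, and then moved to channels-first with `einops.rearrange`. A small sketch of that tensor plumbing with placeholder shapes:

```python
import numpy as np
import torch
import einops

detected_map = np.zeros((512, 512, 3), dtype=np.float32)  # H, W, C condition image

control = torch.from_numpy(detected_map.copy()).float()
control = control.unsqueeze(dim=0)                          # 1, H, W, C
control = einops.rearrange(control, "b h w c -> b c h w")   # 1, C, H, W
print(control.shape)  # torch.Size([1, 3, 512, 512])
```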
884
 
editany_nogradio.py ADDED
@@ -0,0 +1,20 @@
+import pickle
+from editany_lora import EditAnythingLoraModel
+model = EditAnythingLoraModel(
+    base_model_path="runwayml/stable-diffusion-v1-5",
+    controlmodel_name='LAION Pretrained(v0-4)-SD15',
+    lora_model_path=None, use_blip=False, extra_inpaint=True,
+)
+
+with open('input_data.pkl', 'rb') as f:
+    input_data = pickle.load(f)
+
+print(input_data)
+
+refined, output, ref, text = model.process(*input_data['args'], **input_data['kwargs'])
+
+output
+
+# a woman in a tan suit and white shirt
+
+# best quality, extremely detailed,iron man wallpaper
editany_test.py CHANGED
@@ -70,4 +70,4 @@ if __name__ == "__main__":
         lora_weight=0.5,
     )
     demo = create_demo(model.process, model.process_image_click)
-    demo.queue().launch(server_name="0.0.0.0")
+    demo.queue().launch(server_name="0.0.0.0", share=True)
environment.yaml ADDED
@@ -0,0 +1,38 @@
+name: control
+channels:
+  - pytorch
+  - defaults
+dependencies:
+  - python=3.8.5
+  - pip=20.3
+  - cudatoolkit=11.3
+  - pytorch=1.13.1
+  - torchvision=0.14.1
+  - numpy=1.23.1
+  - pip:
+    - gradio==3.35.2
+    - albumentations==1.3.0
+    - opencv-contrib-python==4.3.0.36
+    - imageio==2.9.0
+    - imageio-ffmpeg==0.4.2
+    - pytorch-lightning==1.5.0
+    - omegaconf==2.1.1
+    - test-tube>=0.7.5
+    - streamlit==1.12.1
+    - einops==0.3.0
+    - webdataset==0.2.5
+    - kornia==0.6
+    - open_clip_torch==2.0.2
+    - invisible-watermark>=0.1.5
+    - streamlit-drawable-canvas==0.8.0
+    - torchmetrics==0.6.0
+    - timm==0.6.12
+    - addict==2.4.0
+    - yapf==0.32.0
+    - prettytable==3.6.0
+    - safetensors==0.2.7
+    - basicsr==1.4.2
+    - diffusers==0.17.1
+    - accelerate==0.17.0
+    - transformers==4.30.2
+    - xformers
requirements.txt CHANGED
@@ -30,4 +30,4 @@ transformers==4.30.2
 xformers==0.0.16
 triton
 gradio==3.35.2
-gradio-client==0.2.7
+gradio-client==0.2.7
utils/stable_diffusion_controlnet_inpaint.py CHANGED
@@ -1179,6 +1179,7 @@ class StableDiffusionControlNetInpaintPipeline(
1179
  style_fidelity: float = 0.5,
1180
  reference_attn: bool = True,
1181
  reference_adain: bool = True,
 
1182
  ):
1183
  r"""
1184
  Function invoked when calling the pipeline for generation.
@@ -1272,6 +1273,8 @@ class StableDiffusionControlNetInpaintPipeline(
1272
  Whether to use reference query for self attention's context.
1273
  reference_adain (`bool`):
1274
  Whether to use reference adain.
 
 
1275
 
1276
  Examples:
1277
 
@@ -1346,8 +1349,9 @@ class StableDiffusionControlNetInpaintPipeline(
1346
  ref_prompt_embeds = self._encode_prompt(
1347
  ref_prompt,
1348
  device,
1349
- num_images_per_prompt * 2,
1350
- do_classifier_free_guidance,
 
1351
  negative_prompt="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
1352
  prompt_embeds=None,
1353
  )
@@ -1414,13 +1418,13 @@ class StableDiffusionControlNetInpaintPipeline(
1414
  num_images_per_prompt=num_images_per_prompt,
1415
  device=device,
1416
  dtype=self.controlnet.dtype,
1417
- do_classifier_free_guidance=do_classifier_free_guidance,
1418
  )
1419
  ref_controlnet_conditioning_image = controlnet_conditioning_image.copy()
1420
  ref_controlnet_conditioning_image[-1] = ref_control_image
1421
- # ref_controlnet_conditioning_scale = controlnet_conditioning_scale.copy()
1422
- # ref_controlnet_conditioning_scale[0] = 1.0 # disable the first sam controlnet
1423
- # ref_controlnet_conditioning_scale[-1] = 0.2
1424
 
1425
  # 5. Prepare timesteps
1426
  self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -1491,7 +1495,7 @@ class StableDiffusionControlNetInpaintPipeline(
1491
  prompt_embeds.dtype,
1492
  device,
1493
  generator,
1494
- do_classifier_free_guidance,
1495
  )
1496
 
1497
  # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
@@ -1511,6 +1515,7 @@ class StableDiffusionControlNetInpaintPipeline(
1511
  self.gn_auto_machine_weight = gn_auto_machine_weight
1512
  self.do_classifier_free_guidance = do_classifier_free_guidance
1513
  self.style_fidelity = style_fidelity
 
1514
  self.ref_mask = ref_mask
1515
  self.inpaint_mask = mask_image
1516
  attn_modules, gn_modules = self.redefine_ref_model(
@@ -1518,9 +1523,16 @@ class StableDiffusionControlNetInpaintPipeline(
1518
  )
1519
 
1520
  control_attn_modules, control_gn_modules = self.redefine_ref_model(
1521
- self.controlnet, reference_attn, False, model_type="controlnet"
1522
  )
1523
-
1524
  # 8. Denoising loop
1525
  num_warmup_steps = len(timesteps) - \
1526
  num_inference_steps * self.scheduler.order
@@ -1549,12 +1561,6 @@ class StableDiffusionControlNetInpaintPipeline(
1549
 
1550
  if ref_image is not None: # for ref_only mode
1551
  # ref only part
1552
- noise = randn_tensor(
1553
- ref_image_latents.shape,
1554
- generator=generator,
1555
- device=ref_image_latents.device,
1556
- dtype=ref_image_latents.dtype,
1557
- )
1558
  ref_xt = self.scheduler.add_noise(
1559
  ref_image_latents,
1560
  noise,
@@ -1566,8 +1572,8 @@ class StableDiffusionControlNetInpaintPipeline(
1566
 
1567
  MODE = "write"
1568
  self.change_module_mode(
1569
- MODE, control_attn_modules, control_gn_modules
1570
- )
1571
 
1572
  (
1573
  ref_down_block_res_samples,
@@ -1582,7 +1588,6 @@ class StableDiffusionControlNetInpaintPipeline(
1582
  return_dict=False,
1583
  )
1584
 
1585
- self.change_module_mode(MODE, attn_modules, gn_modules)
1586
  self.unet(
1587
  ref_xt,
1588
  t,
@@ -1595,7 +1600,10 @@ class StableDiffusionControlNetInpaintPipeline(
1595
 
1596
  # predict the noise residual
1597
  MODE = "read" # change to read mode for following noise_pred
 
 
1598
  self.change_module_mode(MODE, attn_modules, gn_modules)
 
1599
  down_block_res_samples, mid_block_res_sample = self.controlnet(
1600
  non_inpainting_latent_model_input,
1601
  t,
 
1179
  style_fidelity: float = 0.5,
1180
  reference_attn: bool = True,
1181
  reference_adain: bool = True,
1182
+ ref_scale: float = 1.0,
1183
  ):
1184
  r"""
1185
  Function invoked when calling the pipeline for generation.
 
1273
  Whether to use reference query for self attention's context.
1274
  reference_adain (`bool`):
1275
  Whether to use reference adain.
1276
+ ref_scale (`float`):
1277
+ reference guidance scale.
1278
 
1279
  Examples:
1280
 
 
1349
  ref_prompt_embeds = self._encode_prompt(
1350
  ref_prompt,
1351
  device,
1352
+ # num_images_per_prompt * 2,
1353
+ num_images_per_prompt * 1,
1354
+ False,
1355
  negative_prompt="longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality",
1356
  prompt_embeds=None,
1357
  )
 
1418
  num_images_per_prompt=num_images_per_prompt,
1419
  device=device,
1420
  dtype=self.controlnet.dtype,
1421
+ do_classifier_free_guidance=False,
1422
  )
1423
  ref_controlnet_conditioning_image = controlnet_conditioning_image.copy()
1424
+ for i in range(len(ref_controlnet_conditioning_image)):
1425
+ ref_controlnet_conditioning_image[i] = ref_controlnet_conditioning_image[i].chunk(
1426
+ 2)[0] # remove the extra guidance for cfg
1427
  ref_controlnet_conditioning_image[-1] = ref_control_image
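For the reference branch the conditioning images are rebuilt without the classifier-free-guidance duplication: every tensor prepared with CFG has its batch dimension doubled, so `.chunk(2)[0]` keeps a single un-duplicated copy. A tiny illustration of what that slicing does (shapes are arbitrary):

```python
import torch

batch, c, h, w = 2, 3, 64, 64
cond = torch.randn(batch, c, h, w)

# With classifier-free guidance the conditioning image is duplicated along dim 0 ...
cond_cfg = torch.cat([cond] * 2, dim=0)          # shape: (2 * batch, c, h, w)

# ... and chunk(2)[0] recovers one un-duplicated copy for the reference pass.
assert torch.equal(cond_cfg.chunk(2)[0], cond)
```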
 
1429
  # 5. Prepare timesteps
1430
  self.scheduler.set_timesteps(num_inference_steps, device=device)
 
1495
  prompt_embeds.dtype,
1496
  device,
1497
  generator,
1498
+ False,
1499
  )
1500
 
1501
  # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
 
1515
  self.gn_auto_machine_weight = gn_auto_machine_weight
1516
  self.do_classifier_free_guidance = do_classifier_free_guidance
1517
  self.style_fidelity = style_fidelity
1518
+ self.ref_scale = ref_scale
1519
  self.ref_mask = ref_mask
1520
  self.inpaint_mask = mask_image
1521
  attn_modules, gn_modules = self.redefine_ref_model(
 
1523
  )
1524
 
1525
  control_attn_modules, control_gn_modules = self.redefine_ref_model(
1526
+ self.controlnet, reference_attn, reference_adain, model_type="controlnet"
1527
+ )
1528
+ if ref_image is not None:
1529
+ noise = randn_tensor(
1530
+ # ref_image_latents.shape,
1531
+ latents.shape,
1532
+ generator=generator,
1533
+ device=ref_image_latents.device,
1534
+ dtype=ref_image_latents.dtype,
1535
  )
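The commit moves the `randn_tensor` call out of the denoising loop, so one noise tensor shaped like `latents` is drawn once and reused every time the reference latents are re-noised. A minimal sketch of that pattern with a Diffusers scheduler; the shapes and the scheduler choice are placeholders:

```python
import torch
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()
scheduler.set_timesteps(50)

ref_latents = torch.randn(1, 4, 64, 64)
noise = torch.randn_like(ref_latents)   # drawn once, outside the loop

for t in scheduler.timesteps:
    # The same noise sample is re-applied at every step, so the reference
    # trajectory stays consistent across timesteps.
    ref_xt = scheduler.add_noise(ref_latents, noise, t)
```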
 
1536
  # 8. Denoising loop
1537
  num_warmup_steps = len(timesteps) - \
1538
  num_inference_steps * self.scheduler.order
 
1561
 
1562
  if ref_image is not None: # for ref_only mode
1563
  # ref only part
1564
  ref_xt = self.scheduler.add_noise(
1565
  ref_image_latents,
1566
  noise,
 
1572
 
1573
  MODE = "write"
1574
  self.change_module_mode(
1575
+ MODE, control_attn_modules, control_gn_modules)
1576
+ self.change_module_mode(MODE, attn_modules, gn_modules)
1577
 
1578
  (
1579
  ref_down_block_res_samples,
 
1588
  return_dict=False,
1589
  )
1590
 
 
1591
  self.unet(
1592
  ref_xt,
1593
  t,
 
1600
 
1601
  # predict the noise residual
1602
  MODE = "read" # change to read mode for following noise_pred
1603
+ self.change_module_mode(
1604
+ MODE, control_attn_modules, control_gn_modules)
1605
  self.change_module_mode(MODE, attn_modules, gn_modules)
1606
+
1607
  down_block_res_samples, mid_block_res_sample = self.controlnet(
1608
  non_inpainting_latent_model_input,
1609
  t,
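The paired `change_module_mode` calls toggle the hacked attention blocks between a "write" pass, where the re-noised reference latents store their hidden states in a per-module bank, and a "read" pass, where the banked states are concatenated to the current hidden states as extra self-attention context (see `stable_diffusion_reference.py` below). A stripped-down sketch of that mechanism, not the actual module code:

```python
import torch

class RefOnlyAttention(torch.nn.Module):
    """Toy stand-in for a hacked BasicTransformerBlock forward."""

    def __init__(self, dim):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(dim, num_heads=4, batch_first=True)
        self.MODE = "write"
        self.bank = []

    def forward(self, hidden_states):
        if self.MODE == "write":
            # Reference pass: remember the (detached) hidden states.
            self.bank.append(hidden_states.detach().clone())
            context = hidden_states
        else:  # "read"
            # Generation pass: attend over current + banked reference states.
            context = torch.cat([hidden_states] + self.bank, dim=1)
            self.bank.clear()
        out, _ = self.attn(hidden_states, context, context)
        return out

block = RefOnlyAttention(64)
x_ref, x = torch.randn(1, 16, 64), torch.randn(1, 16, 64)
block.MODE = "write"; block(x_ref)   # stores reference features
block.MODE = "read";  block(x)       # uses them as extra attention context
```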
utils/stable_diffusion_reference.py CHANGED
@@ -1,12 +1,12 @@
1
  # Based on https://raw.githubusercontent.com/okotaku/diffusers/feature/reference_only_control/examples/community/stable_diffusion_reference.py
2
  # Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236
 
3
  from typing import Any, Callable, Dict, List, Optional, Union, Tuple
4
 
5
  import numpy as np
6
  import PIL.Image
7
  import torch
8
 
9
- from diffusers import StableDiffusionPipeline
10
  from diffusers.models.attention import BasicTransformerBlock
11
  from diffusers.models.unet_2d_blocks import (
12
  CrossAttnDownBlock2D,
@@ -14,11 +14,9 @@ from diffusers.models.unet_2d_blocks import (
14
  DownBlock2D,
15
  UpBlock2D,
16
  )
17
- from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
18
  from diffusers.utils import PIL_INTERPOLATION, logging
19
  import torch.nn.functional as F
20
 
21
-
22
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
23
 
24
  EXAMPLE_DOC_STRING = """
@@ -56,6 +54,127 @@ def torch_dfs(model: torch.nn.Module):
56
  return result
57
 
58
59
  class StableDiffusionReferencePipeline:
60
  def prepare_ref_image(
61
  self,
@@ -237,9 +356,8 @@ class StableDiffusionReferencePipeline:
237
  this_ref_mask = F.interpolate(
238
  this_ref_mask, scale_factor=ref_scale
239
  )
240
- # print("this_ref_mask",this_ref_mask.shape)
241
-
242
- # this_ref_mask = this_ref_mask.view(1,-1,1)
243
  this_ref_mask = this_ref_mask.repeat(
244
  resize_norm_hidden_states.shape[0],
245
  resize_norm_hidden_states.shape[1],
@@ -256,11 +374,14 @@ class StableDiffusionReferencePipeline:
256
  -1,
257
  )
258
  )
 
259
  masked_norm_hidden_states = masked_norm_hidden_states.permute(
260
  0, 2, 1
261
  )
262
  self.bank.append(masked_norm_hidden_states)
263
- # self.bank.append(norm_hidden_states.detach().clone())
 
 
264
  attn_output = self.attn1(
265
  norm_hidden_states,
266
  encoder_hidden_states=encoder_hidden_states
@@ -271,31 +392,27 @@ class StableDiffusionReferencePipeline:
271
  )
272
  if self.MODE == "read":
273
  if self.attention_auto_machine_weight > self.attn_weight:
274
- # scale_ratio = ((self.ref_mask.shape[2] * self.ref_mask.shape[3])/norm_hidden_states.shape[1])**0.5
275
- # print(scale_ratio)
276
- # this_ref_mask = F.interpolate(self.ref_mask.to(norm_hidden_states.device), scale_factor=1/scale_ratio).view(1,1,-1)
277
- # print("resized mask", this_ref_mask.shape, this_ref_mask.max(), this_ref_mask.min(), this_ref_mask.sum())
278
- # ref_hidden_states = torch.cat([norm_hidden_states] + self.bank, dim=1)
279
- # if attention_mask is None:
280
- # attention_mask = torch.ones(
281
- # norm_hidden_states.shape[0], norm_hidden_states.shape[1], ref_hidden_states.shape[1], dtype=norm_hidden_states.dtype, device=norm_hidden_states.device
282
- # )
283
- # this_ref_mask = this_ref_mask.repeat(norm_hidden_states.shape[0], norm_hidden_states.shape[1], 1)
284
- # this_ref_mask = torch.zeros(
285
- # norm_hidden_states.shape[0], norm_hidden_states.shape[1], this_ref_mask.shape[1], dtype=norm_hidden_states.dtype, device=norm_hidden_states.device
286
- # )
287
- # print(attention_mask.shape, this_ref_mask.shape)
288
- # attention_mask = torch.cat((attention_mask, this_ref_mask), dim=-1)
289
- # print("merge", attention_mask.shape)
290
  ref_hidden_states = torch.cat(
291
- [norm_hidden_states] + self.bank, dim=1
292
  )
 
 
 
293
  attn_output_uc = self.attn1(
294
- norm_hidden_states,
295
  encoder_hidden_states=ref_hidden_states,
296
- # attention_mask=attention_mask,
297
  **cross_attention_kwargs,
298
  )
 
299
  attn_output_c = attn_output_uc.clone()
300
  if self.do_classifier_free_guidance and self.style_fidelity > 0:
301
  attn_output_c[self.uc_mask] = self.attn1(
@@ -308,6 +425,9 @@ class StableDiffusionReferencePipeline:
308
  + (1.0 - self.style_fidelity) * attn_output_uc
309
  )
310
  self.bank.clear()
311
  else:
312
  attn_output = self.attn1(
313
  norm_hidden_states,
@@ -317,6 +437,9 @@ class StableDiffusionReferencePipeline:
317
  attention_mask=attention_mask,
318
  **cross_attention_kwargs,
319
  )
320
  if self.use_ada_layer_norm_zero:
321
  attn_output = gate_msa.unsqueeze(1) * attn_output
322
  hidden_states = attn_output + hidden_states
@@ -365,6 +488,10 @@ class StableDiffusionReferencePipeline:
365
  this_ref_mask = F.interpolate(
366
  self.ref_mask.to(x.device), scale_factor=1 / scale_ratio
367
  )
368
  this_ref_mask = this_ref_mask.repeat(
369
  x.shape[0], x.shape[1], 1, 1
370
  ).bool()
@@ -378,8 +505,8 @@ class StableDiffusionReferencePipeline:
378
  masked_x, dim=(2, 3), keepdim=True, correction=0
379
  )
380
 
381
- self.mean_bank.append(mean)
382
- self.var_bank.append(var)
383
  if self.MODE == "read":
384
  if (
385
  self.gn_auto_machine_weight >= self.gn_weight
@@ -387,37 +514,12 @@ class StableDiffusionReferencePipeline:
387
  and len(self.var_bank) > 0
388
  ):
389
  # print("hacked_mid_forward")
390
- scale_ratio = self.inpaint_mask.shape[2] / x.shape[2]
391
- this_inpaint_mask = F.interpolate(
392
- self.inpaint_mask.to(x.device), scale_factor=1 / scale_ratio
393
- )
394
- this_inpaint_mask = this_inpaint_mask.repeat(
395
- x.shape[0], x.shape[1], 1, 1
396
- ).bool()
397
- masked_x = (
398
- x[this_inpaint_mask]
399
- .detach()
400
- .clone()
401
- .view(x.shape[0], x.shape[1], -1, 1)
402
- )
403
- var, mean = torch.var_mean(
404
- masked_x, dim=(2, 3), keepdim=True, correction=0
405
- )
406
- std = torch.maximum(
407
- var, torch.zeros_like(var) + eps) ** 0.5
408
- mean_acc = sum(self.mean_bank) / float(len(self.mean_bank))
409
- var_acc = sum(self.var_bank) / float(len(self.var_bank))
410
- std_acc = (
411
- torch.maximum(var_acc, torch.zeros_like(
412
- var_acc) + eps) ** 0.5
413
- )
414
- x_uc = (((masked_x - mean) / std) * std_acc) + mean_acc
415
- x_c = x_uc.clone()
416
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
417
- x_c[self.uc_mask] = masked_x[self.uc_mask]
418
- masked_x = self.style_fidelity * x_c + \
419
- (1.0 - self.style_fidelity) * x_uc
420
- x[this_inpaint_mask] = masked_x.view(-1)
421
  self.mean_bank = []
422
  self.var_bank = []
423
  return x
@@ -448,6 +550,8 @@ class StableDiffusionReferencePipeline:
448
  self.ref_mask.to(hidden_states.device),
449
  scale_factor=1 / scale_ratio,
450
  )
 
 
451
  this_ref_mask = this_ref_mask.repeat(
452
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
453
  ).bool()
@@ -460,8 +564,8 @@ class StableDiffusionReferencePipeline:
460
  var, mean = torch.var_mean(
461
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
462
  )
463
- self.mean_bank0.append(mean)
464
- self.var_bank0.append(var)
465
  if self.MODE == "read":
466
  if (
467
  self.gn_auto_machine_weight >= self.gn_weight
@@ -469,54 +573,17 @@ class StableDiffusionReferencePipeline:
469
  and len(self.var_bank0) > 0
470
  ):
471
  # print("hacked_CrossAttnDownBlock2D_forward0")
472
- scale_ratio = self.inpaint_mask.shape[2] / \
473
- hidden_states.shape[2]
474
- this_inpaint_mask = F.interpolate(
475
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
476
- )
477
- this_inpaint_mask = this_inpaint_mask.repeat(
478
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
479
- ).bool()
480
- masked_hidden_states = (
481
- hidden_states[this_inpaint_mask]
482
- .detach()
483
- .clone()
484
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
485
- )
486
- var, mean = torch.var_mean(
487
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
488
- )
489
- std = torch.maximum(
490
- var, torch.zeros_like(var) + eps) ** 0.5
491
- mean_acc = sum(self.mean_bank0[i]) / float(
492
- len(self.mean_bank0[i])
493
- )
494
- var_acc = sum(
495
- self.var_bank0[i]) / float(len(self.var_bank0[i]))
496
- std_acc = (
497
- torch.maximum(
498
- var_acc, torch.zeros_like(var_acc) + eps)
499
- ** 0.5
500
- )
501
- hidden_states_uc = (
502
- ((masked_hidden_states - mean) / std) * std_acc
503
- ) + mean_acc
504
- hidden_states_c = hidden_states_uc.clone()
505
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
506
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
507
- masked_hidden_states = (
508
- self.style_fidelity * hidden_states_c
509
- + (1.0 - self.style_fidelity) * hidden_states_uc
510
- )
511
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
512
- -1)
513
 
514
  hidden_states = attn(
515
  hidden_states,
516
  encoder_hidden_states=encoder_hidden_states,
517
  cross_attention_kwargs=cross_attention_kwargs,
518
- # attention_mask=attention_mask,
519
- # encoder_attention_mask=encoder_attention_mask,
520
  return_dict=False,
521
  )[0]
522
  if self.MODE == "write":
@@ -528,6 +595,8 @@ class StableDiffusionReferencePipeline:
528
  self.ref_mask.to(hidden_states.device),
529
  scale_factor=1 / scale_ratio,
530
  )
 
 
531
  this_ref_mask = this_ref_mask.repeat(
532
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
533
  ).bool()
@@ -540,8 +609,8 @@ class StableDiffusionReferencePipeline:
540
  var, mean = torch.var_mean(
541
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
542
  )
543
- self.mean_bank.append(mean)
544
- self.var_bank.append(var)
545
  if self.MODE == "read":
546
  if (
547
  self.gn_auto_machine_weight >= self.gn_weight
@@ -549,48 +618,12 @@ class StableDiffusionReferencePipeline:
549
  and len(self.var_bank) > 0
550
  ):
551
  # print("hack_CrossAttnDownBlock2D_forward")
552
- scale_ratio = self.inpaint_mask.shape[2] / \
553
- hidden_states.shape[2]
554
- this_inpaint_mask = F.interpolate(
555
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
556
- )
557
- this_inpaint_mask = this_inpaint_mask.repeat(
558
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
559
- ).bool()
560
- masked_hidden_states = (
561
- hidden_states[this_inpaint_mask]
562
- .detach()
563
- .clone()
564
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
565
- )
566
- var, mean = torch.var_mean(
567
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
568
- )
569
- std = torch.maximum(
570
- var, torch.zeros_like(var) + eps) ** 0.5
571
- mean_acc = sum(self.mean_bank[i]) / float(
572
- len(self.mean_bank[i])
573
- )
574
- var_acc = sum(
575
- self.var_bank[i]) / float(len(self.var_bank[i]))
576
- std_acc = (
577
- torch.maximum(
578
- var_acc, torch.zeros_like(var_acc) + eps)
579
- ** 0.5
580
- )
581
- hidden_states_uc = (
582
- ((masked_hidden_states - mean) / std) * std_acc
583
- ) + mean_acc
584
- hidden_states_c = hidden_states_uc.clone()
585
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
586
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
587
- masked_hidden_states = (
588
- self.style_fidelity * hidden_states_c
589
- + (1.0 - self.style_fidelity) * hidden_states_uc
590
- )
591
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
592
- -1)
593
594
  output_states = output_states + (hidden_states,)
595
 
596
  if self.MODE == "read":
@@ -598,6 +631,8 @@ class StableDiffusionReferencePipeline:
598
  self.var_bank0 = []
599
  self.mean_bank = []
600
  self.var_bank = []
 
 
601
 
602
  if self.downsamplers is not None:
603
  for downsampler in self.downsamplers:
@@ -625,6 +660,8 @@ class StableDiffusionReferencePipeline:
625
  self.ref_mask.to(hidden_states.device),
626
  scale_factor=1 / scale_ratio,
627
  )
 
 
628
  this_ref_mask = this_ref_mask.repeat(
629
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
630
  ).bool()
@@ -637,8 +674,8 @@ class StableDiffusionReferencePipeline:
637
  var, mean = torch.var_mean(
638
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
639
  )
640
- self.mean_bank.append(mean)
641
- self.var_bank.append(var)
642
  if self.MODE == "read":
643
  if (
644
  self.gn_auto_machine_weight >= self.gn_weight
@@ -646,53 +683,19 @@ class StableDiffusionReferencePipeline:
646
  and len(self.var_bank) > 0
647
  ):
648
  # print("hacked_DownBlock2D_forward")
649
- scale_ratio = self.inpaint_mask.shape[2] / \
650
- hidden_states.shape[2]
651
- this_inpaint_mask = F.interpolate(
652
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
653
- )
654
- this_inpaint_mask = this_inpaint_mask.repeat(
655
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
656
- ).bool()
657
- masked_hidden_states = (
658
- hidden_states[this_inpaint_mask]
659
- .detach()
660
- .clone()
661
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
662
- )
663
- var, mean = torch.var_mean(
664
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
665
- )
666
- std = torch.maximum(
667
- var, torch.zeros_like(var) + eps) ** 0.5
668
- mean_acc = sum(self.mean_bank[i]) / float(
669
- len(self.mean_bank[i])
670
- )
671
- var_acc = sum(
672
- self.var_bank[i]) / float(len(self.var_bank[i]))
673
- std_acc = (
674
- torch.maximum(
675
- var_acc, torch.zeros_like(var_acc) + eps)
676
- ** 0.5
677
- )
678
- hidden_states_uc = (
679
- ((masked_hidden_states - mean) / std) * std_acc
680
- ) + mean_acc
681
- hidden_states_c = hidden_states_uc.clone()
682
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
683
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
684
- masked_hidden_states = (
685
- self.style_fidelity * hidden_states_c
686
- + (1.0 - self.style_fidelity) * hidden_states_uc
687
- )
688
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
689
- -1)
690
 
691
  output_states = output_states + (hidden_states,)
692
 
693
  if self.MODE == "read":
694
  self.mean_bank = []
695
  self.var_bank = []
 
696
 
697
  if self.downsamplers is not None:
698
  for downsampler in self.downsamplers:
@@ -733,6 +736,8 @@ class StableDiffusionReferencePipeline:
733
  self.ref_mask.to(hidden_states.device),
734
  scale_factor=1 / scale_ratio,
735
  )
 
 
736
  this_ref_mask = this_ref_mask.repeat(
737
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
738
  ).bool()
@@ -745,8 +750,8 @@ class StableDiffusionReferencePipeline:
745
  var, mean = torch.var_mean(
746
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
747
  )
748
- self.mean_bank0.append(mean)
749
- self.var_bank0.append(var)
750
  if self.MODE == "read":
751
  if (
752
  self.gn_auto_machine_weight >= self.gn_weight
@@ -754,47 +759,12 @@ class StableDiffusionReferencePipeline:
754
  and len(self.var_bank0) > 0
755
  ):
756
  # print("hacked_CrossAttnUpBlock2D_forward1")
757
- scale_ratio = self.inpaint_mask.shape[2] / \
758
- hidden_states.shape[2]
759
- this_inpaint_mask = F.interpolate(
760
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
761
- )
762
- this_inpaint_mask = this_inpaint_mask.repeat(
763
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
764
- ).bool()
765
- masked_hidden_states = (
766
- hidden_states[this_inpaint_mask]
767
- .detach()
768
- .clone()
769
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
770
- )
771
- var, mean = torch.var_mean(
772
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
773
- )
774
- std = torch.maximum(
775
- var, torch.zeros_like(var) + eps) ** 0.5
776
- mean_acc = sum(self.mean_bank0[i]) / float(
777
- len(self.mean_bank0[i])
778
- )
779
- var_acc = sum(
780
- self.var_bank0[i]) / float(len(self.var_bank0[i]))
781
- std_acc = (
782
- torch.maximum(
783
- var_acc, torch.zeros_like(var_acc) + eps)
784
- ** 0.5
785
- )
786
- hidden_states_uc = (
787
- ((masked_hidden_states - mean) / std) * std_acc
788
- ) + mean_acc
789
- hidden_states_c = hidden_states_uc.clone()
790
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
791
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
792
- masked_hidden_states = (
793
- self.style_fidelity * hidden_states_c
794
- + (1.0 - self.style_fidelity) * hidden_states_uc
795
- )
796
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
797
- -1)
798
 
799
  hidden_states = attn(
800
  hidden_states,
@@ -815,6 +785,8 @@ class StableDiffusionReferencePipeline:
815
  self.ref_mask.to(hidden_states.device),
816
  scale_factor=1 / scale_ratio,
817
  )
 
 
818
  this_ref_mask = this_ref_mask.repeat(
819
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
820
  ).bool()
@@ -827,8 +799,8 @@ class StableDiffusionReferencePipeline:
827
  var, mean = torch.var_mean(
828
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
829
  )
830
- self.mean_bank.append(mean)
831
- self.var_bank.append(var)
832
  if self.MODE == "read":
833
  if (
834
  self.gn_auto_machine_weight >= self.gn_weight
@@ -836,53 +808,20 @@ class StableDiffusionReferencePipeline:
836
  and len(self.var_bank) > 0
837
  ):
838
  # print("hacked_CrossAttnUpBlock2D_forward")
839
- scale_ratio = self.inpaint_mask.shape[2] / \
840
- hidden_states.shape[2]
841
- this_inpaint_mask = F.interpolate(
842
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
843
- )
844
- this_inpaint_mask = this_inpaint_mask.repeat(
845
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
846
- ).bool()
847
- masked_hidden_states = (
848
- hidden_states[this_inpaint_mask]
849
- .detach()
850
- .clone()
851
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
852
- )
853
- var, mean = torch.var_mean(
854
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
855
- )
856
- std = torch.maximum(
857
- var, torch.zeros_like(var) + eps) ** 0.5
858
- mean_acc = sum(self.mean_bank[i]) / float(
859
- len(self.mean_bank[i])
860
- )
861
- var_acc = sum(
862
- self.var_bank[i]) / float(len(self.var_bank[i]))
863
- std_acc = (
864
- torch.maximum(
865
- var_acc, torch.zeros_like(var_acc) + eps)
866
- ** 0.5
867
- )
868
- hidden_states_uc = (
869
- ((masked_hidden_states - mean) / std) * std_acc
870
- ) + mean_acc
871
- hidden_states_c = hidden_states_uc.clone()
872
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
873
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
874
- masked_hidden_states = (
875
- self.style_fidelity * hidden_states_c
876
- + (1.0 - self.style_fidelity) * hidden_states_uc
877
- )
878
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
879
- -1)
880
 
881
  if self.MODE == "read":
882
  self.mean_bank0 = []
883
  self.var_bank0 = []
884
  self.mean_bank = []
885
  self.var_bank = []
 
 
886
 
887
  if self.upsamplers is not None:
888
  for upsampler in self.upsamplers:
@@ -912,6 +851,8 @@ class StableDiffusionReferencePipeline:
912
  self.ref_mask.to(hidden_states.device),
913
  scale_factor=1 / scale_ratio,
914
  )
 
 
915
  this_ref_mask = this_ref_mask.repeat(
916
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
917
  ).bool()
@@ -924,8 +865,8 @@ class StableDiffusionReferencePipeline:
924
  var, mean = torch.var_mean(
925
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
926
  )
927
- self.mean_bank.append(mean)
928
- self.var_bank.append(var)
929
  if self.MODE == "read":
930
  if (
931
  self.gn_auto_machine_weight >= self.gn_weight
@@ -933,51 +874,17 @@ class StableDiffusionReferencePipeline:
933
  and len(self.var_bank) > 0
934
  ):
935
  # print("hacked_UpBlock2D_forward")
936
- scale_ratio = self.inpaint_mask.shape[2] / \
937
- hidden_states.shape[2]
938
- this_inpaint_mask = F.interpolate(
939
- self.inpaint_mask.to(hidden_states.device), scale_factor=1 / scale_ratio
940
- )
941
- this_inpaint_mask = this_inpaint_mask.repeat(
942
- hidden_states.shape[0], hidden_states.shape[1], 1, 1
943
- ).bool()
944
- masked_hidden_states = (
945
- hidden_states[this_inpaint_mask]
946
- .detach()
947
- .clone()
948
- .view(hidden_states.shape[0], hidden_states.shape[1], -1, 1)
949
- )
950
- var, mean = torch.var_mean(
951
- masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
952
- )
953
- std = torch.maximum(
954
- var, torch.zeros_like(var) + eps) ** 0.5
955
- mean_acc = sum(self.mean_bank[i]) / float(
956
- len(self.mean_bank[i])
957
- )
958
- var_acc = sum(
959
- self.var_bank[i]) / float(len(self.var_bank[i]))
960
- std_acc = (
961
- torch.maximum(
962
- var_acc, torch.zeros_like(var_acc) + eps)
963
- ** 0.5
964
- )
965
- hidden_states_uc = (
966
- ((masked_hidden_states - mean) / std) * std_acc
967
- ) + mean_acc
968
- hidden_states_c = hidden_states_uc.clone()
969
- if self.do_classifier_free_guidance and self.style_fidelity > 0:
970
- hidden_states_c[self.uc_mask] = masked_hidden_states[self.uc_mask]
971
- masked_hidden_states = (
972
- self.style_fidelity * hidden_states_c
973
- + (1.0 - self.style_fidelity) * hidden_states_uc
974
- )
975
- hidden_states[this_inpaint_mask] = masked_hidden_states.view(
976
- -1)
977
 
978
  if self.MODE == "read":
979
  self.mean_bank = []
980
  self.var_bank = []
 
981
 
982
  if self.upsamplers is not None:
983
  for upsampler in self.upsamplers:
@@ -1003,6 +910,7 @@ class StableDiffusionReferencePipeline:
1003
  module, BasicTransformerBlock
1004
  )
1005
  module.bank = []
 
1006
  module.attn_weight = float(i) / float(len(attn_modules))
1007
  module.attention_auto_machine_weight = (
1008
  self.attention_auto_machine_weight
@@ -1017,6 +925,7 @@ class StableDiffusionReferencePipeline:
1017
  module.uc_mask = self.uc_mask
1018
  module.style_fidelity = self.style_fidelity
1019
  module.ref_mask = self.ref_mask
 
1020
  else:
1021
  attn_modules = None
1022
  if reference_adain:
@@ -1043,12 +952,14 @@ class StableDiffusionReferencePipeline:
1043
  module.forward = hacked_mid_forward.__get__(
1044
  module, torch.nn.Module
1045
  )
1046
- elif isinstance(module, CrossAttnDownBlock2D):
1047
- module.forward = hack_CrossAttnDownBlock2D_forward.__get__(
1048
- module, CrossAttnDownBlock2D
1049
- )
1050
- module.mean_bank0 = []
1051
- module.var_bank0 = []
 
 
1052
  elif isinstance(module, DownBlock2D):
1053
  module.forward = hacked_DownBlock2D_forward.__get__(
1054
  module, DownBlock2D
@@ -1057,14 +968,17 @@ class StableDiffusionReferencePipeline:
1057
  # module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
1058
  # module.mean_bank0 = []
1059
  # module.var_bank0 = []
 
1060
  elif isinstance(module, UpBlock2D):
1061
  module.forward = hacked_UpBlock2D_forward.__get__(
1062
  module, UpBlock2D
1063
  )
1064
  module.mean_bank0 = []
1065
  module.var_bank0 = []
 
1066
  module.mean_bank = []
1067
  module.var_bank = []
 
1068
  module.attention_auto_machine_weight = (
1069
  self.attention_auto_machine_weight
1070
  )
@@ -1079,6 +993,7 @@ class StableDiffusionReferencePipeline:
1079
  module.style_fidelity = self.style_fidelity
1080
  module.ref_mask = self.ref_mask
1081
  module.inpaint_mask = self.inpaint_mask
 
1082
  else:
1083
  gn_modules = None
1084
  elif model_type == "controlnet":
@@ -1098,6 +1013,7 @@ class StableDiffusionReferencePipeline:
1098
  module, BasicTransformerBlock
1099
  )
1100
  module.bank = []
 
1101
  # float(i) / float(len(attn_modules))
1102
  module.attn_weight = 0.0
1103
  module.attention_auto_machine_weight = (
@@ -1113,9 +1029,61 @@ class StableDiffusionReferencePipeline:
1113
  module.uc_mask = self.uc_mask
1114
  module.style_fidelity = self.style_fidelity
1115
  module.ref_mask = self.ref_mask
 
1116
  else:
1117
  attn_modules = None
1118
- gn_modules = None
1119
 
1120
  return attn_modules, gn_modules
1121
 
@@ -1123,6 +1091,7 @@ class StableDiffusionReferencePipeline:
1123
  if attn_modules is not None:
1124
  for i, module in enumerate(attn_modules):
1125
  module.MODE = mode
 
1126
  if gn_modules is not None:
1127
  for i, module in enumerate(gn_modules):
1128
  module.MODE = mode
 
1
  # Based on https://raw.githubusercontent.com/okotaku/diffusers/feature/reference_only_control/examples/community/stable_diffusion_reference.py
2
  # Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236
3
+ import torch.fft as fft
4
  from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
 
6
  import numpy as np
7
  import PIL.Image
8
  import torch
9
 
 
10
  from diffusers.models.attention import BasicTransformerBlock
11
  from diffusers.models.unet_2d_blocks import (
12
  CrossAttnDownBlock2D,
13
  CrossAttnUpBlock2D,
14
  DownBlock2D,
15
  UpBlock2D,
16
  )
 
17
  from diffusers.utils import PIL_INTERPOLATION, logging
18
  import torch.nn.functional as F
19
 
 
20
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
21
 
22
  EXAMPLE_DOC_STRING = """
 
54
  return result
55
 
56
 
57
+ @torch.no_grad()
58
+ def add_freq_feature(feature1, feature2, ref_ratio):
59
+ """
60
+ feature1: reference feature
61
+ feature2: target feature
62
+ ref_ratio: a larger ratio injects more of the reference's frequency content
63
+ """
64
+ # Convert features to float32 (if not already) for compatibility with fft operations
65
+ data_type = feature2.dtype
66
+ feature1 = feature1.to(torch.float32)
67
+ feature2 = feature2.to(torch.float32)
68
+
69
+ # Compute the Fourier transforms of both features
70
+ spectrum1 = fft.fftn(feature1, dim=(-2, -1))
71
+ spectrum2 = fft.fftn(feature2, dim=(-2, -1))
72
+
73
+ # Extract high-frequency magnitude and phase from feature1
74
+ magnitude1 = torch.abs(spectrum1)
75
+ # phase1 = torch.angle(spectrum1)
76
+
77
+ # Extract magnitude and phase from feature2
78
+ magnitude2 = torch.abs(spectrum2)
79
+ phase2 = torch.angle(spectrum2)
80
+
81
+ magnitude2.mul_((1-ref_ratio)).add_(magnitude1 * ref_ratio)
82
+ # phase2.mul_(1.0).add_(phase1 * 0.0)
83
+
84
+ # Combine magnitude and phase information
85
+ mixed_spectrum = torch.polar(magnitude2, phase2)
86
+
87
+ # Compute the inverse Fourier transform to get the mixed feature
88
+ mixed_feature = fft.ifftn(mixed_spectrum, dim=(-2, -1))
89
+
90
+ del feature1, feature2, spectrum1, spectrum2, magnitude1, magnitude2, phase2, mixed_spectrum
91
+
92
+ # Convert back to the original data type and return the result
93
+ return mixed_feature.to(data_type)
94
+
95
+
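
The helper above blends frequency-domain statistics: the reference feature's FFT magnitude is mixed into the target's while the target's phase is kept. A minimal standalone sketch of the same magnitude-blending idea on dummy tensors (shapes and the ref_ratio value are invented for illustration; this snippet is not part of the commit):

import torch
import torch.fft as fft

ref = torch.randn(1, 4, 8, 8)   # stand-in for the reference feature (n, c, h, w)
tgt = torch.randn(1, 4, 8, 8)   # stand-in for the target feature being edited
ref_ratio = 0.3                 # weight given to the reference magnitude

spec_ref = fft.fftn(ref, dim=(-2, -1))
spec_tgt = fft.fftn(tgt, dim=(-2, -1))

# blend the magnitudes, keep the target's phase, then invert the transform
mag = torch.abs(spec_tgt) * (1 - ref_ratio) + torch.abs(spec_ref) * ref_ratio
mixed = fft.ifftn(torch.polar(mag, torch.angle(spec_tgt)), dim=(-2, -1)).real
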
96
+ @torch.no_grad()
97
+ def save_ref_feature(feature, mask):
98
+ """
99
+ feature: n,c,h,w
100
+ mask: n,1,h,w
101
+
102
+ return n,c,h,w
103
+ """
104
+ return feature * mask
105
+
106
+
107
+ @torch.no_grad()
108
+ def mix_ref_feature(feature, ref_fea_bank, cfg=True, ref_scale=0.0, dim3=False):
109
+ """
110
+ feature: n,l,c or n,c,h,w
111
+ ref_fea_bank: [(n,c,h,w)]
112
+ cfg: True/False
113
+
114
+ return n,l,c or n,c,h,w
115
+ """
116
+ if cfg:
117
+ ref_fea = torch.cat(
118
+ (ref_fea_bank+ref_fea_bank), dim=0)
119
+ else:
120
+ ref_fea = ref_fea_bank
121
+
122
+ if dim3:
123
+ feature = feature.permute(0, 2, 1).view(ref_fea.shape)
124
+
125
+ mixed_feature = add_freq_feature(ref_fea, feature, ref_scale)
126
+
127
+ if dim3:
128
+ mixed_feature = mixed_feature.view(
129
+ ref_fea.shape[0], ref_fea.shape[1], -1).permute(0, 2, 1)
130
+
131
+ del ref_fea
132
+ del feature
133
+ return mixed_feature
134
+
135
+
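
Two details of mix_ref_feature are easy to miss: ref_fea_bank is a Python list, so ref_fea_bank+ref_fea_bank concatenates the list with itself and torch.cat then stacks two copies along the batch axis to match the classifier-free-guidance batch of [uncond; cond]; and with dim3=True an (n, l, c) self-attention feature is reshaped into the reference's (n, c, h, w) layout before the frequency mix and back afterwards. A rough shape-only illustration (all sizes invented; reshape is used here for simplicity, while the function itself uses permute and view):

import torch

n, c, h, w = 1, 320, 16, 16
ref_fea_bank = [torch.randn(n, c, h, w)]                   # one saved reference feature
ref_fea = torch.cat(ref_fea_bank + ref_fea_bank, dim=0)    # (2n, c, h, w), doubled for CFG

feature = torch.randn(2 * n, h * w, c)                     # (n, l, c) self-attention feature
x = feature.permute(0, 2, 1).reshape(ref_fea.shape)        # -> (2n, c, h, w)
back = x.reshape(ref_fea.shape[0], ref_fea.shape[1], -1).permute(0, 2, 1)
assert back.shape == feature.shape                         # round-trips to (n, l, c)
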
136
+ def mix_norm_feature(x, inpaint_mask, mean_bank, var_bank, do_classifier_free_guidance, style_fidelity, uc_mask, eps=1e-6):
137
+ """
138
+ x: input feature n,c,h,w
139
+ inpaint_mask: mask of the region to inpaint
140
+ """
141
+
142
+ # get the inpainting region and only mix this region.
143
+ scale_ratio = inpaint_mask.shape[2] / x.shape[2]
144
+ this_inpaint_mask = F.interpolate(
145
+ inpaint_mask.to(x.device), scale_factor=1 / scale_ratio
146
+ )
147
+ this_inpaint_mask = this_inpaint_mask.repeat(
148
+ x.shape[0], x.shape[1], 1, 1
149
+ ).bool()
150
+ masked_x = (
151
+ x[this_inpaint_mask]
152
+ .detach()
153
+ .clone()
154
+ .view(x.shape[0], x.shape[1], -1, 1)
155
+ )
156
+ var, mean = torch.var_mean(
157
+ masked_x, dim=(2, 3), keepdim=True, correction=0
158
+ )
159
+ std = torch.maximum(
160
+ var, torch.zeros_like(var) + eps) ** 0.5
161
+ mean_acc = sum(mean_bank) / float(len(mean_bank))
162
+ var_acc = sum(var_bank) / float(len(var_bank))
163
+ std_acc = (
164
+ torch.maximum(var_acc, torch.zeros_like(
165
+ var_acc) + eps) ** 0.5
166
+ )
167
+
168
+ x_uc = (((masked_x - mean) / std) * std_acc) + mean_acc
169
+ x_c = x_uc.clone()
170
+ if do_classifier_free_guidance and style_fidelity > 0:
171
+ x_c[uc_mask] = masked_x[uc_mask]
172
+ masked_x = style_fidelity * x_c + \
173
+ (1.0 - style_fidelity) * x_uc
174
+ x[this_inpaint_mask] = masked_x.view(-1)
175
+ return x
176
+
177
+
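
mix_norm_feature is an AdaIN-style swap restricted to the inpaint mask: the masked activations are whitened with their own mean/std, re-colored with the mean/std accumulated during the reference pass, and the conditional and unconditional branches are then blended by style_fidelity. The core statistic swap in isolation (toy tensors; shapes are illustrative only, not taken from the pipeline):

import torch

eps = 1e-6
masked_x = torch.randn(2, 320, 64, 1)      # masked target activations, (n, c, l, 1)
mean_ref = torch.randn(2, 320, 1, 1)       # statistics saved during the WRITE pass
var_ref = torch.rand(2, 320, 1, 1)

var, mean = torch.var_mean(masked_x, dim=(2, 3), keepdim=True, correction=0)
std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5
std_ref = torch.maximum(var_ref, torch.zeros_like(var_ref) + eps) ** 0.5

# whiten with the target's own statistics, re-color with the reference's
x_renormed = ((masked_x - mean) / std) * std_ref + mean_ref
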
178
  class StableDiffusionReferencePipeline:
179
  def prepare_ref_image(
180
  self,
 
356
  this_ref_mask = F.interpolate(
357
  this_ref_mask, scale_factor=ref_scale
358
  )
359
+ self.fea_bank.append(save_ref_feature(
360
+ resize_norm_hidden_states, this_ref_mask))
 
361
  this_ref_mask = this_ref_mask.repeat(
362
  resize_norm_hidden_states.shape[0],
363
  resize_norm_hidden_states.shape[1],
 
374
  -1,
375
  )
376
  )
377
+
378
  masked_norm_hidden_states = masked_norm_hidden_states.permute(
379
  0, 2, 1
380
  )
381
  self.bank.append(masked_norm_hidden_states)
382
+ del masked_norm_hidden_states
383
+ del this_ref_mask
384
+ del resize_norm_hidden_states
385
  attn_output = self.attn1(
386
  norm_hidden_states,
387
  encoder_hidden_states=encoder_hidden_states
 
392
  )
393
  if self.MODE == "read":
394
  if self.attention_auto_machine_weight > self.attn_weight:
395
+ freq_norm_hidden_states = mix_ref_feature(
396
+ norm_hidden_states,
397
+ self.fea_bank,
398
+ cfg=self.do_classifier_free_guidance,
399
+ ref_scale=self.ref_scale,
400
+ dim3=True)
401
+ self.fea_bank.clear()
402
+
403
+ this_bank = torch.cat(self.bank+self.bank, dim=0)
404
  ref_hidden_states = torch.cat(
405
+ (freq_norm_hidden_states, this_bank), dim=1
406
  )
407
+ del this_bank
408
+ self.bank.clear()
409
+
410
  attn_output_uc = self.attn1(
411
+ freq_norm_hidden_states,
412
  encoder_hidden_states=ref_hidden_states,
 
413
  **cross_attention_kwargs,
414
  )
415
+ del ref_hidden_states
416
  attn_output_c = attn_output_uc.clone()
417
  if self.do_classifier_free_guidance and self.style_fidelity > 0:
418
  attn_output_c[self.uc_mask] = self.attn1(
 
425
  + (1.0 - self.style_fidelity) * attn_output_uc
426
  )
427
  self.bank.clear()
428
+ self.fea_bank.clear()
429
+ del attn_output_c
430
+ del attn_output_uc
431
  else:
432
  attn_output = self.attn1(
433
  norm_hidden_states,
 
437
  attention_mask=attention_mask,
438
  **cross_attention_kwargs,
439
  )
440
+ self.bank.clear()
441
+ self.fea_bank.clear()
442
+
443
  if self.use_ada_layer_norm_zero:
444
  attn_output = gate_msa.unsqueeze(1) * attn_output
445
  hidden_states = attn_output + hidden_states
 
488
  this_ref_mask = F.interpolate(
489
  self.ref_mask.to(x.device), scale_factor=1 / scale_ratio
490
  )
491
+
492
+ self.fea_bank.append(save_ref_feature(
493
+ x, this_ref_mask))
494
+
495
  this_ref_mask = this_ref_mask.repeat(
496
  x.shape[0], x.shape[1], 1, 1
497
  ).bool()
 
505
  masked_x, dim=(2, 3), keepdim=True, correction=0
506
  )
507
 
508
+ self.mean_bank.append(torch.cat([mean]*2, dim=0))
509
+ self.var_bank.append(torch.cat([var]*2, dim=0))
510
  if self.MODE == "read":
511
  if (
512
  self.gn_auto_machine_weight >= self.gn_weight
 
514
  and len(self.var_bank) > 0
515
  ):
516
  # print("hacked_mid_forward")
517
+ x = mix_ref_feature(
518
+ x, self.fea_bank, cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
519
+ self.fea_bank = []
520
+ x = mix_norm_feature(x, self.inpaint_mask, self.mean_bank, self.var_bank,
521
+ self.do_classifier_free_guidance,
522
+ self.style_fidelity, self.uc_mask)
523
  self.mean_bank = []
524
  self.var_bank = []
525
  return x
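
Note that the WRITE pass above stores every statistic as torch.cat([mean]*2, dim=0) (and likewise for var). The READ pass runs on a classifier-free-guidance batch of [uncond; cond], so the single reference image's statistics must be repeated along the batch axis before they can broadcast against it. A tiny illustration (sizes invented):

import torch

mean = torch.randn(1, 320, 1, 1)            # statistic computed from one reference image
mean_cfg = torch.cat([mean] * 2, dim=0)     # repeated for the [uncond; cond] CFG batch
hidden = torch.randn(2, 320, 16, 16)        # READ-pass activations with batch size 2
centered = hidden - mean_cfg                # broadcasts row-for-row over the batch
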
 
550
  self.ref_mask.to(hidden_states.device),
551
  scale_factor=1 / scale_ratio,
552
  )
553
+ self.fea_bank0.append(save_ref_feature(
554
+ hidden_states, this_ref_mask))
555
  this_ref_mask = this_ref_mask.repeat(
556
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
557
  ).bool()
 
564
  var, mean = torch.var_mean(
565
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
566
  )
567
+ self.mean_bank0.append(torch.cat([mean]*2, dim=0))
568
+ self.var_bank0.append(torch.cat([var]*2, dim=0))
569
  if self.MODE == "read":
570
  if (
571
  self.gn_auto_machine_weight >= self.gn_weight
 
573
  and len(self.var_bank0) > 0
574
  ):
575
  # print("hacked_CrossAttnDownBlock2D_forward0")
576
+ hidden_states = mix_ref_feature(
577
+ hidden_states, [self.fea_bank0[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
578
+
579
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank0[i], self.var_bank0[i],
580
+ self.do_classifier_free_guidance,
581
+ self.style_fidelity, self.uc_mask)
582
 
583
  hidden_states = attn(
584
  hidden_states,
585
  encoder_hidden_states=encoder_hidden_states,
586
  cross_attention_kwargs=cross_attention_kwargs,
 
 
587
  return_dict=False,
588
  )[0]
589
  if self.MODE == "write":
 
595
  self.ref_mask.to(hidden_states.device),
596
  scale_factor=1 / scale_ratio,
597
  )
598
+ self.fea_bank.append(save_ref_feature(
599
+ hidden_states, this_ref_mask))
600
  this_ref_mask = this_ref_mask.repeat(
601
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
602
  ).bool()
 
609
  var, mean = torch.var_mean(
610
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
611
  )
612
+ self.mean_bank.append(torch.cat([mean]*2, dim=0))
613
+ self.var_bank.append(torch.cat([var]*2, dim=0))
614
  if self.MODE == "read":
615
  if (
616
  self.gn_auto_machine_weight >= self.gn_weight
 
618
  and len(self.var_bank) > 0
619
  ):
620
  # print("hack_CrossAttnDownBlock2D_forward")
621
+ hidden_states = mix_ref_feature(
622
+ hidden_states, [self.fea_bank[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
623
 
624
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank[i], self.var_bank[i],
625
+ self.do_classifier_free_guidance,
626
+ self.style_fidelity, self.uc_mask)
627
  output_states = output_states + (hidden_states,)
628
 
629
  if self.MODE == "read":
 
631
  self.var_bank0 = []
632
  self.mean_bank = []
633
  self.var_bank = []
634
+ self.fea_bank0 = []
635
+ self.fea_bank = []
636
 
637
  if self.downsamplers is not None:
638
  for downsampler in self.downsamplers:
 
660
  self.ref_mask.to(hidden_states.device),
661
  scale_factor=1 / scale_ratio,
662
  )
663
+ self.fea_bank.append(save_ref_feature(
664
+ hidden_states, this_ref_mask))
665
  this_ref_mask = this_ref_mask.repeat(
666
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
667
  ).bool()
 
674
  var, mean = torch.var_mean(
675
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
676
  )
677
+ self.mean_bank.append(torch.cat([mean]*2, dim=0))
678
+ self.var_bank.append(torch.cat([var]*2, dim=0))
679
  if self.MODE == "read":
680
  if (
681
  self.gn_auto_machine_weight >= self.gn_weight
 
683
  and len(self.var_bank) > 0
684
  ):
685
  # print("hacked_DownBlock2D_forward")
686
+ hidden_states = mix_ref_feature(
687
+ hidden_states, [self.fea_bank[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
688
+
689
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank[i], self.var_bank[i],
690
+ self.do_classifier_free_guidance,
691
+ self.style_fidelity, self.uc_mask)
692
 
693
  output_states = output_states + (hidden_states,)
694
 
695
  if self.MODE == "read":
696
  self.mean_bank = []
697
  self.var_bank = []
698
+ self.fea_bank = []
699
 
700
  if self.downsamplers is not None:
701
  for downsampler in self.downsamplers:
 
736
  self.ref_mask.to(hidden_states.device),
737
  scale_factor=1 / scale_ratio,
738
  )
739
+ self.fea_bank0.append(save_ref_feature(
740
+ hidden_states, this_ref_mask))
741
  this_ref_mask = this_ref_mask.repeat(
742
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
743
  ).bool()
 
750
  var, mean = torch.var_mean(
751
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
752
  )
753
+ self.mean_bank0.append(torch.cat([mean]*2, dim=0))
754
+ self.var_bank0.append(torch.cat([var]*2, dim=0))
755
  if self.MODE == "read":
756
  if (
757
  self.gn_auto_machine_weight >= self.gn_weight
 
759
  and len(self.var_bank0) > 0
760
  ):
761
  # print("hacked_CrossAttnUpBlock2D_forward1")
762
+ hidden_states = mix_ref_feature(
763
+ hidden_states, [self.fea_bank0[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
764
+
765
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank0[i], self.var_bank0[i],
766
+ self.do_classifier_free_guidance,
767
+ self.style_fidelity, self.uc_mask)
768
 
769
  hidden_states = attn(
770
  hidden_states,
 
785
  self.ref_mask.to(hidden_states.device),
786
  scale_factor=1 / scale_ratio,
787
  )
788
+ self.fea_bank.append(save_ref_feature(
789
+ hidden_states, this_ref_mask))
790
  this_ref_mask = this_ref_mask.repeat(
791
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
792
  ).bool()
 
799
  var, mean = torch.var_mean(
800
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
801
  )
802
+ self.mean_bank.append(torch.cat([mean]*2, dim=0))
803
+ self.var_bank.append(torch.cat([var]*2, dim=0))
804
  if self.MODE == "read":
805
  if (
806
  self.gn_auto_machine_weight >= self.gn_weight
 
808
  and len(self.var_bank) > 0
809
  ):
810
  # print("hacked_CrossAttnUpBlock2D_forward")
811
+ hidden_states = mix_ref_feature(
812
+ hidden_states, [self.fea_bank[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
813
+
814
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank[i], self.var_bank[i],
815
+ self.do_classifier_free_guidance,
816
+ self.style_fidelity, self.uc_mask)
817
 
818
  if self.MODE == "read":
819
  self.mean_bank0 = []
820
  self.var_bank0 = []
821
  self.mean_bank = []
822
  self.var_bank = []
823
+ self.fea_bank = []
824
+ self.fea_bank0 = []
825
 
826
  if self.upsamplers is not None:
827
  for upsampler in self.upsamplers:
 
851
  self.ref_mask.to(hidden_states.device),
852
  scale_factor=1 / scale_ratio,
853
  )
854
+ self.fea_bank.append(save_ref_feature(
855
+ hidden_states, this_ref_mask))
856
  this_ref_mask = this_ref_mask.repeat(
857
  hidden_states.shape[0], hidden_states.shape[1], 1, 1
858
  ).bool()
 
865
  var, mean = torch.var_mean(
866
  masked_hidden_states, dim=(2, 3), keepdim=True, correction=0
867
  )
868
+ self.mean_bank.append(torch.cat([mean]*2, dim=0))
869
+ self.var_bank.append(torch.cat([var]*2, dim=0))
870
  if self.MODE == "read":
871
  if (
872
  self.gn_auto_machine_weight >= self.gn_weight
 
874
  and len(self.var_bank) > 0
875
  ):
876
  # print("hacked_UpBlock2D_forward")
877
+ hidden_states = mix_ref_feature(
878
+ hidden_states, [self.fea_bank[i]], cfg=self.do_classifier_free_guidance, ref_scale=self.ref_scale)
879
+
880
+ hidden_states = mix_norm_feature(hidden_states, self.inpaint_mask, self.mean_bank[i], self.var_bank[i],
881
+ self.do_classifier_free_guidance,
882
+ self.style_fidelity, self.uc_mask)
883
 
884
  if self.MODE == "read":
885
  self.mean_bank = []
886
  self.var_bank = []
887
+ self.fea_bank = []
888
 
889
  if self.upsamplers is not None:
890
  for upsampler in self.upsamplers:
 
910
  module, BasicTransformerBlock
911
  )
912
  module.bank = []
913
+ module.fea_bank = []
914
  module.attn_weight = float(i) / float(len(attn_modules))
915
  module.attention_auto_machine_weight = (
916
  self.attention_auto_machine_weight
 
925
  module.uc_mask = self.uc_mask
926
  module.style_fidelity = self.style_fidelity
927
  module.ref_mask = self.ref_mask
928
+ module.ref_scale = self.ref_scale
929
  else:
930
  attn_modules = None
931
  if reference_adain:
 
952
  module.forward = hacked_mid_forward.__get__(
953
  module, torch.nn.Module
954
  )
955
+ # elif isinstance(module, CrossAttnDownBlock2D):
956
+ # module.forward = hack_CrossAttnDownBlock2D_forward.__get__(
957
+ # module, CrossAttnDownBlock2D
958
+ # )
959
+ # module.mean_bank0 = []
960
+ # module.var_bank0 = []
961
+ # module.fea_bank0 = []
962
+
963
  elif isinstance(module, DownBlock2D):
964
  module.forward = hacked_DownBlock2D_forward.__get__(
965
  module, DownBlock2D
 
968
  # module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D)
969
  # module.mean_bank0 = []
970
  # module.var_bank0 = []
971
+ # module.fea_bank0 = []
972
  elif isinstance(module, UpBlock2D):
973
  module.forward = hacked_UpBlock2D_forward.__get__(
974
  module, UpBlock2D
975
  )
976
  module.mean_bank0 = []
977
  module.var_bank0 = []
978
+ module.fea_bank0 = []
979
  module.mean_bank = []
980
  module.var_bank = []
981
+ module.fea_bank = []
982
  module.attention_auto_machine_weight = (
983
  self.attention_auto_machine_weight
984
  )
 
993
  module.style_fidelity = self.style_fidelity
994
  module.ref_mask = self.ref_mask
995
  module.inpaint_mask = self.inpaint_mask
996
+ module.ref_scale = self.ref_scale
997
  else:
998
  gn_modules = None
999
  elif model_type == "controlnet":
 
1013
  module, BasicTransformerBlock
1014
  )
1015
  module.bank = []
1016
+ module.fea_bank = []
1017
  # float(i) / float(len(attn_modules))
1018
  module.attn_weight = 0.0
1019
  module.attention_auto_machine_weight = (
 
1029
  module.uc_mask = self.uc_mask
1030
  module.style_fidelity = self.style_fidelity
1031
  module.ref_mask = self.ref_mask
1032
+ module.ref_scale = self.ref_scale
1033
  else:
1034
  attn_modules = None
1035
+ # gn_modules = None
1036
+ if reference_adain:
1037
+ gn_modules = [model.mid_block]
1038
+ model.mid_block.gn_weight = 0
1039
+
1040
+ down_blocks = model.down_blocks
1041
+ for w, module in enumerate(down_blocks):
1042
+ module.gn_weight = 1.0 - float(w) / float(len(down_blocks))
1043
+ gn_modules.append(module)
1044
+ # print(module.__class__.__name__,module.gn_weight)
1045
+
1046
+
1047
+ for i, module in enumerate(gn_modules):
1048
+ if getattr(module, "original_forward", None) is None:
1049
+ module.original_forward = module.forward
1050
+ if i == 0:
1051
+ # mid_block
1052
+ module.forward = hacked_mid_forward.__get__(
1053
+ module, torch.nn.Module
1054
+ )
1055
+ # elif isinstance(module, CrossAttnDownBlock2D):
1056
+ # module.forward = hack_CrossAttnDownBlock2D_forward.__get__(
1057
+ # module, CrossAttnDownBlock2D
1058
+ # )
1059
+ # module.mean_bank0 = []
1060
+ # module.var_bank0 = []
1061
+ # module.fea_bank0 = []
1062
+
1063
+ elif isinstance(module, DownBlock2D):
1064
+ module.forward = hacked_DownBlock2D_forward.__get__(
1065
+ module, DownBlock2D
1066
+ )
1067
+ module.mean_bank = []
1068
+ module.var_bank = []
1069
+ module.fea_bank = []
1070
+ module.attention_auto_machine_weight = (
1071
+ self.attention_auto_machine_weight
1072
+ )
1073
+ module.gn_auto_machine_weight = self.gn_auto_machine_weight
1074
+ module.do_classifier_free_guidance = (
1075
+ self.do_classifier_free_guidance
1076
+ )
1080
+ module.uc_mask = self.uc_mask
1081
+ module.style_fidelity = self.style_fidelity
1082
+ module.ref_mask = self.ref_mask
1083
+ module.inpaint_mask = self.inpaint_mask
1084
+ module.ref_scale = self.ref_scale
1085
+ else:
1086
+ gn_modules = None
1087
 
1088
  return attn_modules, gn_modules
1089
 
 
1091
  if attn_modules is not None:
1092
  for i, module in enumerate(attn_modules):
1093
  module.MODE = mode
1094
+
1095
  if gn_modules is not None:
1096
  for i, module in enumerate(gn_modules):
1097
  module.MODE = mode
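
For orientation, the hacked modules are driven by a two-pass protocol: the pipeline first switches every hooked module to "write" and runs the UNet on the reference latents so the bank / fea_bank / mean_bank / var_bank lists get filled, then switches to "read" for the actual denoising step, where the hacked forwards consume and clear those banks. A schematic of that toggle (the commented unet calls are placeholders, not the pipeline's real call signature):

# Sketch of the two-pass reference protocol (assumed usage, not part of this commit).
for module in (attn_modules or []) + (gn_modules or []):
    module.MODE = "write"
# unet(reference_latents, t, encoder_hidden_states=...)   # fills the banks

for module in (attn_modules or []) + (gn_modules or []):
    module.MODE = "read"
# unet(noisy_latents, t, encoder_hidden_states=...)       # consumes and clears the banks
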