shgao committed
Commit
f14200d
1 Parent(s): ecdaa2c

update new demo

app.py CHANGED
@@ -1,9 +1,12 @@
1
  import gradio as gr
2
-
3
 
4
  from sam2edit import create_demo as create_demo_edit_anything
5
- # from sam2image import create_demo as create_demo_generate_anything
6
-
7
 
8
  DESCRIPTION = f'''# [Edit Anything](https://github.com/sail-sg/EditAnything)
9
  **Edit anything and keep the layout by segmenting anything in the image.**
@@ -12,13 +15,45 @@ SHARED_UI_WARNING = f'''### [NOTE] Inference may be slow in this shared UI.
12
  You can duplicate and use it with a paid private GPU.
13
  <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/jyseo/3DFuse?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-xl-dark.svg" alt="Duplicate Space"></a>
14
  '''
15
  with gr.Blocks() as demo:
16
  gr.Markdown(DESCRIPTION)
17
- gr.Markdown(SHARED_UI_WARNING)
18
  with gr.Tabs():
19
- with gr.TabItem('Edit Anything'):
20
- create_demo_edit_anything()
21
  # with gr.TabItem('Generate Anything'):
22
  # create_demo_generate_anything()
23
 
24
  demo.queue(api_open=False).launch()
 
1
  import gradio as gr
2
+ import os
3
 
4
  from sam2edit import create_demo as create_demo_edit_anything
5
+ from sam2image import create_demo as create_demo_generate_anything
6
+ from sam2edit_beauty import create_demo as create_demo_beauty
7
+ from sam2edit_handsome import create_demo as create_demo_handsome
8
+ from sam2edit_lora import EditAnythingLoraModel, init_sam_model, init_blip_processor, init_blip_model
9
+ from huggingface_hub import hf_hub_download, snapshot_download
10
 
11
  DESCRIPTION = f'''# [Edit Anything](https://github.com/sail-sg/EditAnything)
12
  **Edit anything and keep the layout by segmenting anything in the image.**
 
15
  You can duplicate and use it with a paid private GPU.
16
  <a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/jyseo/3DFuse?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-xl-dark.svg" alt="Duplicate Space"></a>
17
  '''
18
+
19
+ #
20
+ sam_generator = init_sam_model()
21
+ blip_processor = init_blip_processor()
22
+ blip_model = init_blip_model()
23
+
24
+ sd_models_path = snapshot_download("shgao/sdmodels")
25
+
26
  with gr.Blocks() as demo:
27
  gr.Markdown(DESCRIPTION)
 
28
  with gr.Tabs():
29
+ with gr.TabItem('🖌Edit Anything'):
30
+ model = EditAnythingLoraModel(base_model_path="stabilityai/stable-diffusion-2-inpainting",
31
+ controlmodel_name='LAION Pretrained(v0-4)-SD21',
32
+ lora_model_path=None, use_blip=True, extra_inpaint=False,
33
+ sam_generator=sam_generator,
34
+ blip_processor=blip_processor,
35
+ blip_model=blip_model)
36
+ create_demo_edit_anything(model.process)
37
+ with gr.TabItem(' 👩‍🦰Beauty Edit/Generation'):
38
+ lora_model_path = hf_hub_download(
39
+ "mlida/Cute_girl_mix4", "cuteGirlMix4_v10.safetensors")
40
+ model = EditAnythingLoraModel(base_model_path=os.path.join(sd_models_path, "chilloutmix_NiPrunedFp32Fix"),
41
+ lora_model_path=lora_model_path, use_blip=True, extra_inpaint=True,
42
+ sam_generator=sam_generator,
43
+ blip_processor=blip_processor,
44
+ blip_model=blip_model
45
+ )
46
+ create_demo_beauty(model.process)
47
+ with gr.TabItem(' 👨‍🌾Handsome Edit/Generation'):
48
+ model = EditAnythingLoraModel(base_model_path=os.path.join(sd_models_path, "Realistic_Vision_V2.0"),
49
+ lora_model_path=None, use_blip=True, extra_inpaint=True,
50
+ sam_generator=sam_generator,
51
+ blip_processor=blip_processor,
52
+ blip_model=blip_model)
53
+ create_demo_handsome(model.process)
54
  # with gr.TabItem('Generate Anything'):
55
  # create_demo_generate_anything()
56
+ with gr.Tabs():
57
+ gr.Markdown(SHARED_UI_WARNING)
58
 
59
  demo.queue(api_open=False).launch()
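
Note: the updated `app.py` resolves its model weights through `huggingface_hub` before building the tabs. A minimal sketch of how those two helpers behave (the repo IDs and subfolder are the ones used above; the surrounding script is illustrative only):

```python
import os
from huggingface_hub import hf_hub_download, snapshot_download

# hf_hub_download fetches (and caches) a single file from a Hub repo and
# returns its local file path; app.py uses it for the LoRA .safetensors file.
lora_path = hf_hub_download("mlida/Cute_girl_mix4", "cuteGirlMix4_v10.safetensors")

# snapshot_download mirrors a whole repo into the local cache and returns the
# directory path, so subfolders can then be joined with os.path.join().
sd_models_path = snapshot_download("shgao/sdmodels")
chilloutmix_dir = os.path.join(sd_models_path, "chilloutmix_NiPrunedFp32Fix")
```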
sam2edit.py CHANGED
@@ -1,321 +1,85 @@
1
  # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
- from torchvision.utils import save_image
3
- from PIL import Image
4
- from pytorch_lightning import seed_everything
5
- import subprocess
6
- from collections import OrderedDict
7
-
8
- import cv2
9
- import einops
10
  import gradio as gr
11
- import numpy as np
12
- import torch
13
- import random
14
- import os
15
- import requests
16
- from io import BytesIO
17
- from annotator.util import resize_image, HWC3
18
-
19
- def create_demo():
20
- device = "cuda" if torch.cuda.is_available() else "cpu"
21
- use_blip = True
22
- use_gradio = True
23
-
24
- # Diffusion init using diffusers.
25
-
26
- # diffusers==0.14.0 required.
27
- from diffusers import StableDiffusionInpaintPipeline
28
- from diffusers import ControlNetModel, UniPCMultistepScheduler
29
- from utils.stable_diffusion_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
30
- from diffusers.utils import load_image
31
-
32
- base_model_path = "stabilityai/stable-diffusion-2-inpainting"
33
- config_dict = OrderedDict([('SAM Pretrained(v0-1): Good Natural Sense', 'shgao/edit-anything-v0-1-1'),
34
- ('LAION Pretrained(v0-3): Good Face', 'shgao/edit-anything-v0-3'),
35
- ('SD Inpainting: Not keep position', 'stabilityai/stable-diffusion-2-inpainting')
36
- ])
37
- def obtain_generation_model(controlnet_path):
38
- if controlnet_path=='stabilityai/stable-diffusion-2-inpainting':
39
- pipe = StableDiffusionInpaintPipeline.from_pretrained(
40
- "stabilityai/stable-diffusion-2-inpainting",
41
- torch_dtype=torch.float16,
42
- )
43
- else:
44
- controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
45
- pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
46
- base_model_path, controlnet=controlnet, torch_dtype=torch.float16
47
- )
48
- # speed up diffusion process with faster scheduler and memory optimization
49
- pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
50
- # remove following line if xformers is not installed
51
- pipe.enable_xformers_memory_efficient_attention()
52
-
53
- pipe.enable_model_cpu_offload() # disable for now because of an unknown bug in accelerate
54
- # pipe.to(device)
55
- return pipe
56
- global default_controlnet_path
57
- global pipe
58
- default_controlnet_path = config_dict['LAION Pretrained(v0-3): Good Face']
59
- pipe = obtain_generation_model(default_controlnet_path)
60
-
61
- # Segment-Anything init.
62
- # pip install git+https://github.com/facebookresearch/segment-anything.git
63
-
64
- try:
65
- from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
66
- except ImportError:
67
- print('segment_anything not installed')
68
- result = subprocess.run(['pip', 'install', 'git+https://github.com/facebookresearch/segment-anything.git'], check=True)
69
- print(f'Install segment_anything {result}')
70
- from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
71
- if not os.path.exists('./models/sam_vit_h_4b8939.pth'):
72
- result = subprocess.run(['wget', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth', '-P', 'models'], check=True)
73
- print(f'Download sam_vit_h_4b8939.pth {result}')
74
- sam_checkpoint = "models/sam_vit_h_4b8939.pth"
75
- model_type = "default"
76
- sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
77
- sam.to(device=device)
78
- mask_generator = SamAutomaticMaskGenerator(sam)
79
-
80
-
81
- # BLIP2 init.
82
- if use_blip:
83
- # need the latest transformers
84
- # pip install git+https://github.com/huggingface/transformers.git
85
- from transformers import AutoProcessor, Blip2ForConditionalGeneration
86
-
87
- processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
88
- blip_model = Blip2ForConditionalGeneration.from_pretrained(
89
- "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto")
90
-
91
-
92
- def get_blip2_text(image):
93
- inputs = processor(image, return_tensors="pt").to(device, torch.float16)
94
- generated_ids = blip_model.generate(**inputs, max_new_tokens=50)
95
- generated_text = processor.batch_decode(
96
- generated_ids, skip_special_tokens=True)[0].strip()
97
- return generated_text
98
-
99
-
100
- def show_anns(anns):
101
- if len(anns) == 0:
102
- return
103
- sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
104
- full_img = None
105
-
106
- # for ann in sorted_anns:
107
- for i in range(len(sorted_anns)):
108
- ann = anns[i]
109
- m = ann['segmentation']
110
- if full_img is None:
111
- full_img = np.zeros((m.shape[0], m.shape[1], 3))
112
- map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
113
- map[m != 0] = i + 1
114
- color_mask = np.random.random((1, 3)).tolist()[0]
115
- full_img[m != 0] = color_mask
116
- full_img = full_img*255
117
- # anno encoding from https://github.com/LUSSeg/ImageNet-S
118
- res = np.zeros((map.shape[0], map.shape[1], 3))
119
- res[:, :, 0] = map % 256
120
- res[:, :, 1] = map // 256
121
- res.astype(np.float32)
122
- full_img = Image.fromarray(np.uint8(full_img))
123
- return full_img, res
124
-
125
-
126
- def get_sam_control(image):
127
- masks = mask_generator.generate(image)
128
- full_img, res = show_anns(masks)
129
- return full_img, res
130
-
131
-
132
- def process(condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta):
133
-
134
- input_image = source_image["image"]
135
- if mask_image is None:
136
- if enable_all_generate:
137
- print("source_image", source_image["mask"].shape, input_image.shape,)
138
- print(source_image["mask"].max())
139
- mask_image = np.ones((input_image.shape[0], input_image.shape[1], 3))*255
140
- else:
141
- mask_image = source_image["mask"]
142
- global default_controlnet_path
143
- print("To Use:", config_dict[condition_model], "Current:", default_controlnet_path)
144
- if default_controlnet_path!=config_dict[condition_model]:
145
- print("Change condition model to:", config_dict[condition_model])
146
- global pipe
147
- pipe = obtain_generation_model(config_dict[condition_model])
148
- default_controlnet_path = config_dict[condition_model]
149
- torch.cuda.empty_cache()
150
-
151
- with torch.no_grad():
152
- if use_blip and (enable_auto_prompt or len(prompt) == 0):
153
- print("Generating text:")
154
- blip2_prompt = get_blip2_text(input_image)
155
- print("Generated text:", blip2_prompt)
156
- if len(prompt)>0:
157
- prompt = blip2_prompt + ',' + prompt
158
- else:
159
- prompt = blip2_prompt
160
- print("All text:", prompt)
161
-
162
- input_image = HWC3(input_image)
163
-
164
- img = resize_image(input_image, image_resolution)
165
- H, W, C = img.shape
166
-
167
- print("Generating SAM seg:")
168
- # the default SAM model is trained with 1024 size.
169
- full_segmask, detected_map = get_sam_control(
170
- resize_image(input_image, detect_resolution))
171
-
172
- detected_map = HWC3(detected_map.astype(np.uint8))
173
- detected_map = cv2.resize(
174
- detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
175
-
176
- control = torch.from_numpy(
177
- detected_map.copy()).float().cuda()
178
- control = torch.stack([control for _ in range(num_samples)], dim=0)
179
- control = einops.rearrange(control, 'b h w c -> b c h w').clone()
180
-
181
- mask_image = HWC3(mask_image.astype(np.uint8))
182
- mask_image = cv2.resize(
183
- mask_image, (W, H), interpolation=cv2.INTER_LINEAR)
184
- mask_image = Image.fromarray(mask_image)
185
-
186
-
187
- if seed == -1:
188
- seed = random.randint(0, 65535)
189
- seed_everything(seed)
190
- generator = torch.manual_seed(seed)
191
- if condition_model=='SD Inpainting: Not keep position':
192
- x_samples = pipe(
193
- image=img,
194
- mask_image=mask_image,
195
- prompt=[prompt + ', ' + a_prompt] * num_samples,
196
- negative_prompt=[n_prompt] * num_samples,
197
- num_images_per_prompt=num_samples,
198
- num_inference_steps=ddim_steps,
199
- generator=generator,
200
- height=H,
201
- width=W,
202
- ).images
203
- else:
204
- x_samples = pipe(
205
- image=img,
206
- mask_image=mask_image,
207
- prompt=[prompt + ', ' + a_prompt] * num_samples,
208
- negative_prompt=[n_prompt] * num_samples,
209
- num_images_per_prompt=num_samples,
210
- num_inference_steps=ddim_steps,
211
- generator=generator,
212
- controlnet_conditioning_image=control.type(torch.float16),
213
- height=H,
214
- width=W,
215
- controlnet_conditioning_scale=control_scale,
216
- ).images
217
-
218
-
219
- results = [x_samples[i] for i in range(num_samples)]
220
- return [full_segmask, mask_image] + results, prompt
221
-
222
-
223
- def download_image(url):
224
- response = requests.get(url)
225
- return Image.open(BytesIO(response.content)).convert("RGB")
226
-
227
- # disable gradio when not using GUI.
228
- if not use_gradio:
229
- # This part is not updated; it's just an example of using it without the GUI.
230
- image_path = "../data/samples/sa_223750.jpg"
231
- mask_path = "../data/samples/sa_223750inpaint.png"
232
- input_image = Image.open(image_path)
233
- mask_image = Image.open(mask_path)
234
- enable_auto_prompt = True
235
- input_image = np.array(input_image, dtype=np.uint8)
236
- mask_image = np.array(mask_image, dtype=np.uint8)
237
- prompt = "esplendent sunset sky, red brick wall"
238
- a_prompt = 'best quality, extremely detailed'
239
- n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
240
- num_samples = 3
241
- image_resolution = 512
242
- detect_resolution = 512
243
- ddim_steps = 30
244
- guess_mode = False
245
- strength = 1.0
246
- scale = 9.0
247
- seed = -1
248
- eta = 0.0
249
-
250
- outputs = process(condition_model, input_image, mask_image, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution,
251
- detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta)
252
-
253
- image_list = []
254
- input_image = resize_image(input_image, 512)
255
- image_list.append(torch.tensor(input_image))
256
- for i in range(len(outputs)):
257
- each = outputs[i]
258
- if type(each) is not np.ndarray:
259
- each = np.array(each, dtype=np.uint8)
260
- each = resize_image(each, 512)
261
- print(i, each.shape)
262
- image_list.append(torch.tensor(each))
263
-
264
- image_list = torch.stack(image_list).permute(0, 3, 1, 2)
265
-
266
- save_image(image_list, "sample.jpg", nrow=3,
267
- normalize=True, value_range=(0, 255))
268
- else:
269
- print("The GUI is not fully tested yet. Please open an issue if you find bugs.")
270
- block = gr.Blocks()
271
- with block as demo:
272
- with gr.Row():
273
- gr.Markdown(
274
- "## Edit Anything")
275
- with gr.Row():
276
- with gr.Column():
277
- source_image = gr.Image(source='upload',label="Image (Upload an image and cover the region you want to edit with sketch)", type="numpy", tool="sketch")
278
- enable_all_generate = gr.Checkbox(label='Auto generation on all regions.', value=False)
279
- prompt = gr.Textbox(label="Prompt (Text in the expected things of edited region)")
280
- enable_auto_prompt = gr.Checkbox(label='Auto-generate a text prompt from the input image with BLIP2. Warning: enabling this may prevent your prompt from taking effect.', value=True)
281
- control_scale = gr.Slider(
282
- label="Mask Align strength (Large value means more strict alignment with SAM mask)", minimum=0, maximum=1, value=1, step=0.1)
283
- run_button = gr.Button(label="Run")
284
  condition_model = gr.Dropdown(choices=list(config_dict.keys()),
285
- value=list(config_dict.keys())[1],
286
- label='Model',
287
- multiselect=False)
288
- num_samples = gr.Slider(
289
- label="Images", minimum=1, maximum=12, value=2, step=1)
290
- with gr.Accordion("Advanced options", open=False):
291
- mask_image = gr.Image(source='upload', label="(Optional) Upload a predefined mask of edit region if you do not want to write your prompt.", type="numpy", value=None)
292
- image_resolution = gr.Slider(
293
- label="Image Resolution", minimum=256, maximum=768, value=512, step=64)
294
- strength = gr.Slider(
295
- label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
296
- guess_mode = gr.Checkbox(label='Guess Mode', value=False)
297
- detect_resolution = gr.Slider(
298
- label="SAM Resolution", minimum=128, maximum=2048, value=1024, step=1)
299
- ddim_steps = gr.Slider(
300
- label="Steps", minimum=1, maximum=100, value=30, step=1)
301
- scale = gr.Slider(
302
- label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
303
- seed = gr.Slider(label="Seed", minimum=-1,
304
- maximum=2147483647, step=1, randomize=True)
305
- eta = gr.Number(label="eta (DDIM)", value=0.0)
306
- a_prompt = gr.Textbox(
307
- label="Added Prompt", value='best quality, extremely detailed')
308
- n_prompt = gr.Textbox(label="Negative Prompt",
309
- value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
310
- with gr.Column():
311
- result_gallery = gr.Gallery(
312
- label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
313
- result_text = gr.Text(label='BLIP2+Human Prompt Text')
314
- ips = [condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution,
315
- detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta]
316
- run_button.click(fn=process, inputs=ips, outputs=[result_gallery, result_text])
317
- return demo
318
 
319
  if __name__ == '__main__':
320
- demo = create_demo()
321
  demo.queue().launch(server_name='0.0.0.0')
 
1
  # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
  import gradio as gr
3
+ from diffusers.utils import load_image
4
+ from sam2edit_lora import EditAnythingLoraModel, config_dict
5
+
6
+
7
+ def create_demo(process):
8
+
9
+
10
+
11
+ print("The GUI is not fully tested yet. Please open an issue if you find bugs.")
12
+ WARNING_INFO = f'''### [NOTE] This model was collected from the Internet for demo purposes only; please do not use it commercially.
13
+ We are not responsible for any risks arising from the use of this model.
14
+ '''
15
+ block = gr.Blocks()
16
+ with block as demo:
17
+ with gr.Row():
18
+ gr.Markdown(
19
+ "## Generate Your Beauty powered by EditAnything https://github.com/sail-sg/EditAnything ")
20
+ with gr.Row():
21
+ with gr.Column():
22
+ source_image = gr.Image(
23
+ source='upload', label="Image (Upload an image and cover the region you want to edit with sketch)", type="numpy", tool="sketch")
24
+ enable_all_generate = gr.Checkbox(
25
+ label='Auto generation on all regions.', value=False)
26
+ prompt = gr.Textbox(
27
+ label="Prompt (Text in the expected things of edited region)")
28
+ enable_auto_prompt = gr.Checkbox(
29
+ label='Auto-generate a text prompt from the input image with BLIP2. Warning: enabling this may prevent your prompt from taking effect.', value=False)
30
+ a_prompt = gr.Textbox(
31
+ label="Added Prompt", value='best quality, extremely detailed')
32
+ n_prompt = gr.Textbox(label="Negative Prompt",
33
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
34
+ control_scale = gr.Slider(
35
+ label="Mask Align strength (Large value means more strict alignment with SAM mask)", minimum=0, maximum=1, value=1, step=0.1)
36
+ run_button = gr.Button(label="Run")
37
+ num_samples = gr.Slider(
38
+ label="Images", minimum=1, maximum=12, value=2, step=1)
39
+ seed = gr.Slider(label="Seed", minimum=-1,
40
+ maximum=2147483647, step=1, randomize=True)
41
+ with gr.Accordion("Advanced options", open=False):
42
  condition_model = gr.Dropdown(choices=list(config_dict.keys()),
43
+ value=list(
44
+ config_dict.keys())[1],
45
+ label='Model',
46
+ multiselect=False)
47
+ mask_image = gr.Image(
48
+ source='upload', label="(Optional) Upload a predefined mask of edit region if you do not want to write your prompt.", type="numpy", value=None)
49
+ image_resolution = gr.Slider(
50
+ label="Image Resolution", minimum=256, maximum=768, value=512, step=64)
51
+ strength = gr.Slider(
52
+ label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
53
+ guess_mode = gr.Checkbox(
54
+ label='Guess Mode', value=False)
55
+ detect_resolution = gr.Slider(
56
+ label="SAM Resolution", minimum=128, maximum=2048, value=1024, step=1)
57
+ ddim_steps = gr.Slider(
58
+ label="Steps", minimum=1, maximum=100, value=30, step=1)
59
+ scale = gr.Slider(
60
+ label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
61
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
62
+ with gr.Column():
63
+ result_gallery = gr.Gallery(
64
+ label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
65
+ result_text = gr.Text(label='BLIP2+Human Prompt Text')
66
+ ips = [condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution,
67
+ detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta]
68
+ run_button.click(fn=process, inputs=ips, outputs=[
69
+ result_gallery, result_text])
70
+ # with gr.Row():
71
+ # ex = gr.Examples(examples=examples, fn=process,
72
+ # inputs=[a_prompt, n_prompt, scale],
73
+ # outputs=[result_gallery],
74
+ # cache_examples=False)
75
+ with gr.Row():
76
+ gr.Markdown(WARNING_INFO)
77
+ return demo
78
+
79
 
80
  if __name__ == '__main__':
81
+ model = EditAnythingLoraModel(base_model_path="stabilityai/stable-diffusion-2-inpainting",
82
+ controlmodel_name='LAION Pretrained(v0-4)-SD21', extra_inpaint=False,
83
+ lora_model_path=None, use_blip=True)
84
+ demo = create_demo(model.process)
85
  demo.queue().launch(server_name='0.0.0.0')
sam2edit_beauty.py ADDED
@@ -0,0 +1,95 @@
1
+ # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
+ import gradio as gr
3
+ from diffusers.utils import load_image
4
+ from sam2edit_lora import EditAnythingLoraModel, config_dict
5
+
6
+
7
+ def create_demo(process):
8
+
9
+ examples = [
10
+ ["dudou,1girl, beautiful face, solo, candle, brown hair, long hair, <lora:flowergirl:0.9>,ulzzang-6500-v1.1,(raw photo:1.2),((photorealistic:1.4))best quality ,masterpiece, illustration, an extremely delicate and beautiful, extremely detailed ,CG ,unity ,8k wallpaper, Amazing, finely detail, masterpiece,best quality,official art,extremely detailed CG unity 8k wallpaper,absurdres, incredibly absurdres, huge filesize, ultra-detailed, highres, extremely detailed,beautiful detailed girl, extremely detailed eyes and face, beautiful detailed eyes,cinematic lighting,1girl,see-through,looking at viewer,full body,full-body shot,outdoors,arms behind back,(chinese clothes) <lora:cuteGirlMix4_v10:1>",
11
+ "(((mole))),sketches, (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, bad anatomy,(long hair:1.4),DeepNegative,(fat:1.2),facing away, looking away,tilted head, lowres,bad anatomy,bad hands, text, error, missing fingers,extra digit, fewer digits, cropped, worstquality, low quality, normal quality,jpegartifacts,signature, watermark, username,blurry,bad feet,cropped,poorly drawn hands,poorly drawn face,mutation,deformed,worst quality,low quality,normal quality,jpeg artifacts,signature,watermark,extra fingers,fewer digits,extra limbs,extra arms,extra legs,malformed limbs,fused fingers,too many fingers,long neck,cross-eyed,mutated hands,polar lowres,bad body,bad proportions,gross proportions,text,error,missing fingers,missing arms,missing legs,extra digit, extra arms, extra leg, extra foot,(freckles),(mole:2)", 5],
12
+ ["best quality, ultra high res, (photorealistic:1.4), (detailed beautiful girl:1.4), (medium breasts:0.8), looking_at_viewer, Detailed facial details, beautiful detailed eyes, (multicolored|blue|pink hair: 1.2), green eyes, slender, haunting smile, (makeup:0.3), red lips, <lora:cuteGirlMix4_v10:0.7>, highly detailed clothes, (ulzzang-6500-v1.1:0.3)",
13
+ "EasyNegative, paintings, sketches, ugly, 3d, (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, manboobs, backlight,(ugly:1.3), (duplicate:1.3), (morbid:1.2), (mutilated:1.2), (tranny:1.3), mutated hands, (poorly drawn hands:1.3), blurry, (bad anatomy:1.2), (bad proportions:1.3), extra limbs, (disfigured:1.3), (more than 2 nipples:1.3), (more than 1 navel:1.3), (missing arms:1.3), (extra legs:1.3), (fused fingers:1.6), (too many fingers:1.6), (unclear eyes:1.3), bad hands, missing fingers, extra digit, (futa:1.1), bad body, double navel, mutad arms, hused arms, (puffy nipples, dark areolae, dark nipples, rei no himo, inverted nipples, long nipples), NG_DeepNegative_V1_75t, pubic hair, fat rolls, obese, bad-picture-chill-75v", 8],
14
+ ["best quality, ultra high res, (photorealistic:1.4), (detailed beautiful girl:1.4), (medium breasts:0.8), looking_at_viewer, Detailed facial details, beautiful detailed eyes, (blue|pink hair), green eyes, slender, smile, (makeup:0.4), red lips, (full body, sitting, beach), <lora:cuteGirlMix4_v10:0.7>, highly detailed clothes, (ulzzang-6500-v1.1:0.3)",
15
+ "asyNegative, paintings, sketches, ugly, 3d, (worst quality:2), (low quality:2), (normal quality:2), lowres, normal quality, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, manboobs, backlight,(ugly:1.3), (duplicate:1.3), (morbid:1.2), (mutilated:1.2), (tranny:1.3), mutated hands, (poorly drawn hands:1.3), blurry, (bad anatomy:1.2), (bad proportions:1.3), extra limbs, (disfigured:1.3), (more than 2 nipples:1.3), (more than 1 navel:1.3), (missing arms:1.3), (extra legs:1.3), (fused fingers:1.6), (too many fingers:1.6), (unclear eyes:1.3), bad hands, missing fingers, extra digit, (futa:1.1), bad body, double navel, mutad arms, hused arms, (puffy nipples, dark areolae, dark nipples, rei no himo, inverted nipples, long nipples), NG_DeepNegative_V1_75t, pubic hair, fat rolls, obese, bad-picture-chill-75v", 7],
16
+ ["mix4, whole body shot, ((8k, RAW photo, highest quality, masterpiece), High detail RAW color photo professional close-up photo, shy expression, cute, beautiful detailed girl, detailed fingers, extremely detailed eyes and face, beautiful detailed nose, beautiful detailed eyes, long eyelashes, light on face, looking at viewer, (closed mouth:1.2), 1girl, cute, young, mature face, (full body:1.3), ((small breasts)), realistic face, realistic body, beautiful detailed thigh,s, same eyes color, (realistic, photo realism:1. 37), (highest quality), (best shadow), (best illustration), ultra high resolution, physics-based rendering, cinematic lighting), solo, 1girl, highly detailed, in office, detailed office, open cardigan, ponytail contorted, beautiful eyes ,sitting in office,dating, business suit, cross-laced clothes, collared shirt, beautiful breast, small breast, Chinese dress, white pantyhose, natural breasts, pink and white hair, <lora:cuteGirlMix4_v10:1>",
17
+ "paintings, sketches, (worst quality:2), (low quality:2), (normal quality:2), cloth, underwear, bra, low-res, normal quality, ((monochrome)), ((grayscale)), skin spots, acne, skin blemishes, age spots, glans, bad nipples, long nipples, bad vagina, extra fingers,fewer fingers,strange fingers,bad hand, ng_deepnegative_v1_75t, bad-picture-chill-75v", 7]
18
+ ]
19
+
20
+ print("The GUI is not fully tested yet. Please open an issue if you find bugs.")
21
+ WARNING_INFO = f'''### [NOTE] This model was collected from the Internet for demo purposes only; please do not use it commercially.
22
+ We are not responsible for any risks arising from the use of this model.
23
+
24
+ Lora model from https://civitai.com/models/14171/cutegirlmix4 Thanks!
25
+ '''
26
+ block = gr.Blocks()
27
+ with block as demo:
28
+ with gr.Row():
29
+ gr.Markdown(
30
+ "## Generate Your Beauty powered by EditAnything https://github.com/sail-sg/EditAnything ")
31
+ with gr.Row():
32
+ with gr.Column():
33
+ source_image = gr.Image(
34
+ source='upload', label="Image (Upload an image and cover the region you want to edit with sketch)", type="numpy", tool="sketch")
35
+ enable_all_generate = gr.Checkbox(
36
+ label='Auto generation on all regions.', value=False)
37
+ prompt = gr.Textbox(
38
+ label="Prompt (Text in the expected things of edited region)")
39
+ enable_auto_prompt = gr.Checkbox(
40
+ label='Auto-generate a text prompt from the input image with BLIP2. Warning: enabling this may prevent your prompt from taking effect.', value=False)
41
+ a_prompt = gr.Textbox(
42
+ label="Added Prompt", value='best quality, extremely detailed')
43
+ n_prompt = gr.Textbox(label="Negative Prompt",
44
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
45
+ control_scale = gr.Slider(
46
+ label="Mask Align strength (Large value means more strict alignment with SAM mask)", minimum=0, maximum=1, value=1, step=0.1)
47
+ run_button = gr.Button(label="Run")
48
+ num_samples = gr.Slider(
49
+ label="Images", minimum=1, maximum=12, value=2, step=1)
50
+ seed = gr.Slider(label="Seed", minimum=-1,
51
+ maximum=2147483647, step=1, randomize=True)
52
+ with gr.Accordion("Advanced options", open=False):
53
+ condition_model = gr.Dropdown(choices=list(config_dict.keys()),
54
+ value=list(
55
+ config_dict.keys())[0],
56
+ label='Model',
57
+ multiselect=False)
58
+ mask_image = gr.Image(
59
+ source='upload', label="(Optional) Upload a predefined mask of edit region if you do not want to write your prompt.", type="numpy", value=None)
60
+ image_resolution = gr.Slider(
61
+ label="Image Resolution", minimum=256, maximum=768, value=512, step=64)
62
+ strength = gr.Slider(
63
+ label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
64
+ guess_mode = gr.Checkbox(
65
+ label='Guess Mode', value=False)
66
+ detect_resolution = gr.Slider(
67
+ label="SAM Resolution", minimum=128, maximum=2048, value=1024, step=1)
68
+ ddim_steps = gr.Slider(
69
+ label="Steps", minimum=1, maximum=100, value=30, step=1)
70
+ scale = gr.Slider(
71
+ label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
72
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
73
+ with gr.Column():
74
+ result_gallery = gr.Gallery(
75
+ label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
76
+ result_text = gr.Text(label='BLIP2+Human Prompt Text')
77
+ ips = [condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution,
78
+ detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta]
79
+ run_button.click(fn=process, inputs=ips, outputs=[
80
+ result_gallery, result_text])
81
+ with gr.Row():
82
+ ex = gr.Examples(examples=examples, fn=process,
83
+ inputs=[a_prompt, n_prompt, scale],
84
+ outputs=[result_gallery],
85
+ cache_examples=False)
86
+ with gr.Row():
87
+ gr.Markdown(WARNING_INFO)
88
+ return demo
89
+
90
+
91
+ if __name__ == '__main__':
92
+ model = EditAnythingLoraModel(base_model_path='../chilloutmix_NiPrunedFp32Fix',
93
+ lora_model_path='../40806/mix4', use_blip=True)
94
+ demo = create_demo(model.process)
95
+ demo.queue().launch(server_name='0.0.0.0')
sam2edit_handsome.py ADDED
@@ -0,0 +1,90 @@
1
+ # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
+ import gradio as gr
3
+ from diffusers.utils import load_image
4
+ from sam2edit_lora import EditAnythingLoraModel, config_dict
5
+
6
+
7
+
8
+ def create_demo(process):
9
+
10
+ examples = [
11
+ ["1man, muscle,full body, vest, short straight hair, glasses, Gym, barbells, dumbbells, treadmills, boxing rings, squat racks, plates, dumbbell racks soft lighting, masterpiece, best quality, 8k uhd, film grain, Fujifilm XT3 photorealistic painting art by midjourney and greg rutkowski <lora:asianmale_v10:0.6>", "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck", 6],
12
+ ["1man, 25 years- old, full body, wearing long-sleeve white shirt and tie, muscular rand black suit, soft lighting, masterpiece, best quality, 8k uhd, dslr, film grain, Fujifilm XT3 photorealistic painting art by midjourney and greg rutkowski <lora:asianmale_v10:0.6> <lora:uncutPenisLora_v10:0.6>","(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime:1.4), text, close up, cropped, out of frame, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck",6],
13
+ ]
14
+
15
+ print("The GUI is not fully tested yet. Please open an issue if you find bugs.")
16
+ WARNING_INFO = f'''### [NOTE] This model was collected from the Internet for demo purposes only; please do not use it commercially.
17
+ We are not responsible for any risks arising from the use of this model.
18
+ Base model from https://huggingface.co/SG161222/Realistic_Vision_V2.0 Thanks!
19
+ '''
20
+ block = gr.Blocks()
21
+ with block as demo:
22
+ with gr.Row():
23
+ gr.Markdown(
24
+ "## Generate Your Handsome powered by EditAnything https://github.com/sail-sg/EditAnything ")
25
+ with gr.Row():
26
+ with gr.Column():
27
+ source_image = gr.Image(
28
+ source='upload', label="Image (Upload an image and cover the region you want to edit with sketch)", type="numpy", tool="sketch")
29
+ enable_all_generate = gr.Checkbox(
30
+ label='Auto generation on all region.', value=False)
31
+ prompt = gr.Textbox(
32
+ label="Prompt (Text in the expected things of edited region)")
33
+ enable_auto_prompt = gr.Checkbox(
34
+ label='Auto generate text prompt from input image with BLIP2: Warning: Enable this may makes your prompt not working.', value=False)
35
+ a_prompt = gr.Textbox(
36
+ label="Added Prompt", value='best quality, extremely detailed')
37
+ n_prompt = gr.Textbox(label="Negative Prompt",
38
+ value='longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality')
39
+ control_scale = gr.Slider(
40
+ label="Mask Align strength (Large value means more strict alignment with SAM mask)", minimum=0, maximum=1, value=1, step=0.1)
41
+ run_button = gr.Button(label="Run")
42
+ num_samples = gr.Slider(
43
+ label="Images", minimum=1, maximum=12, value=2, step=1)
44
+ seed = gr.Slider(label="Seed", minimum=-1,
45
+ maximum=2147483647, step=1, randomize=True)
46
+ with gr.Accordion("Advanced options", open=False):
47
+ condition_model = gr.Dropdown(choices=list(config_dict.keys()),
48
+ value=list(
49
+ config_dict.keys())[0],
50
+ label='Model',
51
+ multiselect=False)
52
+ mask_image = gr.Image(
53
+ source='upload', label="(Optional) Upload a predefined mask of edit region if you do not want to write your prompt.", type="numpy", value=None)
54
+ image_resolution = gr.Slider(
55
+ label="Image Resolution", minimum=256, maximum=768, value=512, step=64)
56
+ strength = gr.Slider(
57
+ label="Control Strength", minimum=0.0, maximum=2.0, value=1.0, step=0.01)
58
+ guess_mode = gr.Checkbox(
59
+ label='Guess Mode', value=False)
60
+ detect_resolution = gr.Slider(
61
+ label="SAM Resolution", minimum=128, maximum=2048, value=1024, step=1)
62
+ ddim_steps = gr.Slider(
63
+ label="Steps", minimum=1, maximum=100, value=30, step=1)
64
+ scale = gr.Slider(
65
+ label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1)
66
+ eta = gr.Number(label="eta (DDIM)", value=0.0)
67
+ with gr.Column():
68
+ result_gallery = gr.Gallery(
69
+ label='Output', show_label=False, elem_id="gallery").style(grid=2, height='auto')
70
+ result_text = gr.Text(label='BLIP2+Human Prompt Text')
71
+ ips = [condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution,
72
+ detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta]
73
+ run_button.click(fn=process, inputs=ips, outputs=[
74
+ result_gallery, result_text])
75
+ with gr.Row():
76
+ ex = gr.Examples(examples=examples, fn=process,
77
+ inputs=[a_prompt, n_prompt, scale],
78
+ outputs=[result_gallery],
79
+ cache_examples=False)
80
+ with gr.Row():
81
+ gr.Markdown(WARNING_INFO)
82
+ return demo
83
+
84
+
85
+
86
+ if __name__ == '__main__':
87
+ model = EditAnythingLoraModel(base_model_path= '../../gradio-rel/EditAnything/models/Realistic_Vision_V2.0',
88
+ lora_model_path= '../../gradio-rel/EditAnything/models/asianmale', use_blip=True)
89
+ demo = create_demo(model.process)
90
+ demo.queue().launch(server_name='0.0.0.0')
sam2edit_lora.py ADDED
@@ -0,0 +1,478 @@
1
+ # Edit Anything trained with Stable Diffusion + ControlNet + SAM + BLIP2
2
+ from torchvision.utils import save_image
3
+ from PIL import Image
4
+ from pytorch_lightning import seed_everything
5
+ import subprocess
6
+ from collections import OrderedDict
7
+ import re
8
+ import cv2
9
+ import einops
10
+ import gradio as gr
11
+ import numpy as np
12
+ import torch
13
+ import random
14
+ import os
15
+ import requests
16
+ from io import BytesIO
17
+ from annotator.util import resize_image, HWC3
18
+
19
+ import torch
20
+ from safetensors.torch import load_file
21
+ from collections import defaultdict
22
+ from diffusers import StableDiffusionControlNetPipeline
23
+ from diffusers import ControlNetModel, UniPCMultistepScheduler
24
+ from utils.stable_diffusion_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline
25
+ # from utils.tmp import StableDiffusionControlNetInpaintPipeline
26
+ # need the latest transformers
27
+ # pip install git+https://github.com/huggingface/transformers.git
28
+ from transformers import AutoProcessor, Blip2ForConditionalGeneration
29
+
30
+ # Segment-Anything init.
31
+ # pip install git+https://github.com/facebookresearch/segment-anything.git
32
+ try:
33
+ from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
34
+ except ImportError:
35
+ print('segment_anything not installed')
36
+ result = subprocess.run(
37
+ ['pip', 'install', 'git+https://github.com/facebookresearch/segment-anything.git'], check=True)
38
+ print(f'Install segment_anything {result}')
39
+ from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
40
+ if not os.path.exists('./models/sam_vit_h_4b8939.pth'):
41
+ result = subprocess.run(
42
+ ['wget', 'https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth', '-P', 'models'], check=True)
43
+ print(f'Download sam_vit_h_4b8939.pth {result}')
44
+
45
+ device = "cuda" if torch.cuda.is_available() else "cpu"
46
+
47
+ config_dict = OrderedDict([
48
+ ('LAION Pretrained(v0-4)-SD15', 'shgao/edit-anything-v0-4-sd15'),
49
+ ('LAION Pretrained(v0-4)-SD21', 'shgao/edit-anything-v0-4-sd21'),
50
+ ])
51
+
52
+
53
+ def init_sam_model():
54
+ sam_checkpoint = "models/sam_vit_h_4b8939.pth"
55
+ model_type = "default"
56
+ sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
57
+ sam.to(device=device)
58
+ sam_generator = SamAutomaticMaskGenerator(sam)
59
+ return sam_generator
60
+
61
+
62
+ def init_blip_processor():
63
+ blip_processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
64
+ return blip_processor
65
+
66
+
67
+ def init_blip_model():
68
+ blip_model = Blip2ForConditionalGeneration.from_pretrained(
69
+ "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto")
70
+ return blip_model
71
+
72
+
73
+ def get_pipeline_embeds(pipeline, prompt, negative_prompt, device):
74
+ # https://github.com/huggingface/diffusers/issues/2136
75
+ """ Get pipeline embeds for prompts bigger than the maxlength of the pipe
76
+ :param pipeline:
77
+ :param prompt:
78
+ :param negative_prompt:
79
+ :param device:
80
+ :return:
81
+ """
82
+ max_length = pipeline.tokenizer.model_max_length
83
+
84
+ # simple way to determine length of tokens
85
+ count_prompt = len(re.split(r', ', prompt))
86
+ count_negative_prompt = len(re.split(r', ', negative_prompt))
87
+
88
+ # create the tensor based on which prompt is longer
89
+ if count_prompt >= count_negative_prompt:
90
+ input_ids = pipeline.tokenizer(
91
+ prompt, return_tensors="pt", truncation=False).input_ids.to(device)
92
+ shape_max_length = input_ids.shape[-1]
93
+ negative_ids = pipeline.tokenizer(negative_prompt, truncation=False, padding="max_length",
94
+ max_length=shape_max_length, return_tensors="pt").input_ids.to(device)
95
+ else:
96
+ negative_ids = pipeline.tokenizer(
97
+ negative_prompt, return_tensors="pt", truncation=False).input_ids.to(device)
98
+ shape_max_length = negative_ids.shape[-1]
99
+ input_ids = pipeline.tokenizer(prompt, return_tensors="pt", truncation=False, padding="max_length",
100
+ max_length=shape_max_length).input_ids.to(device)
101
+
102
+ concat_embeds = []
103
+ neg_embeds = []
104
+ for i in range(0, shape_max_length, max_length):
105
+ concat_embeds.append(pipeline.text_encoder(
106
+ input_ids[:, i: i + max_length])[0])
107
+ neg_embeds.append(pipeline.text_encoder(
108
+ negative_ids[:, i: i + max_length])[0])
109
+
110
+ return torch.cat(concat_embeds, dim=1), torch.cat(neg_embeds, dim=1)
111
+
112
+
113
+ def load_lora_weights(pipeline, checkpoint_path, multiplier, device, dtype):
114
+ LORA_PREFIX_UNET = "lora_unet"
115
+ LORA_PREFIX_TEXT_ENCODER = "lora_te"
116
+ # load LoRA weight from .safetensors
117
+ if isinstance(checkpoint_path, str):
118
+
119
+ state_dict = load_file(checkpoint_path, device=device)
120
+
121
+ updates = defaultdict(dict)
122
+ for key, value in state_dict.items():
123
+ # it is suggested to print out the key, it usually will be something like below
124
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
125
+
126
+ layer, elem = key.split('.', 1)
127
+ updates[layer][elem] = value
128
+
129
+ # directly update weight in diffusers model
130
+ for layer, elems in updates.items():
131
+
132
+ if "text" in layer:
133
+ layer_infos = layer.split(
134
+ LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
135
+ curr_layer = pipeline.text_encoder
136
+ else:
137
+ layer_infos = layer.split(
138
+ LORA_PREFIX_UNET + "_")[-1].split("_")
139
+ curr_layer = pipeline.unet
140
+
141
+ # find the target layer
142
+ temp_name = layer_infos.pop(0)
143
+ while len(layer_infos) > -1:
144
+ try:
145
+ curr_layer = curr_layer.__getattr__(temp_name)
146
+ if len(layer_infos) > 0:
147
+ temp_name = layer_infos.pop(0)
148
+ elif len(layer_infos) == 0:
149
+ break
150
+ except Exception:
151
+ if len(temp_name) > 0:
152
+ temp_name += "_" + layer_infos.pop(0)
153
+ else:
154
+ temp_name = layer_infos.pop(0)
155
+
156
+ # get elements for this layer
157
+ weight_up = elems['lora_up.weight'].to(dtype)
158
+ weight_down = elems['lora_down.weight'].to(dtype)
159
+ alpha = elems['alpha']
160
+ if alpha:
161
+ alpha = alpha.item() / weight_up.shape[1]
162
+ else:
163
+ alpha = 1.0
164
+
165
+ # update weight
166
+ if len(weight_up.shape) == 4:
167
+ curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up.squeeze(
168
+ 3).squeeze(2), weight_down.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
169
+ else:
170
+ curr_layer.weight.data += multiplier * \
171
+ alpha * torch.mm(weight_up, weight_down)
172
+ else:
173
+ for ckptpath in checkpoint_path:
174
+ state_dict = load_file(ckptpath, device=device)
175
+
176
+ updates = defaultdict(dict)
177
+ for key, value in state_dict.items():
178
+ # it is suggested to print out the key, it usually will be something like below
179
+ # "lora_te_text_model_encoder_layers_0_self_attn_k_proj.lora_down.weight"
180
+
181
+ layer, elem = key.split('.', 1)
182
+ updates[layer][elem] = value
183
+
184
+ # directly update weight in diffusers model
185
+ for layer, elems in updates.items():
186
+ if "text" in layer:
187
+ layer_infos = layer.split(
188
+ LORA_PREFIX_TEXT_ENCODER + "_")[-1].split("_")
189
+ curr_layer = pipeline.text_encoder
190
+ else:
191
+ layer_infos = layer.split(
192
+ LORA_PREFIX_UNET + "_")[-1].split("_")
193
+ curr_layer = pipeline.unet
194
+
195
+ # find the target layer
196
+ temp_name = layer_infos.pop(0)
197
+ while len(layer_infos) > -1:
198
+ try:
199
+ curr_layer = curr_layer.__getattr__(temp_name)
200
+ if len(layer_infos) > 0:
201
+ temp_name = layer_infos.pop(0)
202
+ elif len(layer_infos) == 0:
203
+ break
204
+ except Exception:
205
+ if len(temp_name) > 0:
206
+ temp_name += "_" + layer_infos.pop(0)
207
+ else:
208
+ temp_name = layer_infos.pop(0)
209
+
210
+ # get elements for this layer
211
+ weight_up = elems['lora_up.weight'].to(dtype)
212
+ weight_down = elems['lora_down.weight'].to(dtype)
213
+ alpha = elems['alpha']
214
+ if alpha:
215
+ alpha = alpha.item() / weight_up.shape[1]
216
+ else:
217
+ alpha = 1.0
218
+
219
+ # update weight
220
+ if len(weight_up.shape) == 4:
221
+ curr_layer.weight.data += multiplier * alpha * torch.mm(weight_up.squeeze(
222
+ 3).squeeze(2), weight_down.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
223
+ else:
224
+ curr_layer.weight.data += multiplier * \
225
+ alpha * torch.mm(weight_up, weight_down)
226
+ return pipeline
227
+
228
+
229
+ def make_inpaint_condition(image, image_mask):
230
+ # image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
231
+ image = image / 255.0
232
+ print("img", image.max(), image.min(), image_mask.max(), image_mask.min())
233
+ # image_mask = np.array(image_mask.convert("L"))
234
+ assert image.shape[0:1] == image_mask.shape[0:
235
+ 1], "image and image_mask must have the same image size"
236
+ image[image_mask > 128] = -1.0 # set as masked pixel
237
+ image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
238
+ image = torch.from_numpy(image)
239
+ return image
240
+
241
+
242
+ def obtain_generation_model(base_model_path, lora_model_path, controlnet_path, generation_only=False, extra_inpaint=True):
243
+ if generation_only and extra_inpaint:
244
+ controlnet = ControlNetModel.from_pretrained(
245
+ controlnet_path, torch_dtype=torch.float16)
246
+ pipe = StableDiffusionControlNetPipeline.from_pretrained(
247
+ base_model_path, controlnet=controlnet, torch_dtype=torch.float16, safety_checker=None
248
+ )
249
+ elif extra_inpaint:
250
+ print("Warning: ControlNet based inpainting model only support SD1.5 for now.")
251
+ controlnet = [
252
+ ControlNetModel.from_pretrained(
253
+ controlnet_path, torch_dtype=torch.float16),
254
+ ControlNetModel.from_pretrained(
255
+ 'lllyasviel/control_v11p_sd15_inpaint', torch_dtype=torch.float16), # inpainting controlnet
256
+ ]
257
+ pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
258
+ base_model_path, controlnet=controlnet, torch_dtype=torch.float16, safety_checker=None
259
+ )
260
+ else:
261
+ controlnet = ControlNetModel.from_pretrained(
262
+ controlnet_path, torch_dtype=torch.float16)
263
+ pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
264
+ base_model_path, controlnet=controlnet, torch_dtype=torch.float16, safety_checker=None
265
+ )
266
+ if lora_model_path is not None:
267
+ pipe = load_lora_weights(
268
+ pipe, [lora_model_path], 1.0, 'cpu', torch.float32)
269
+ # speed up diffusion process with faster scheduler and memory optimization
270
+ pipe.scheduler = UniPCMultistepScheduler.from_config(
271
+ pipe.scheduler.config)
272
+ # remove following line if xformers is not installed
273
+ pipe.enable_xformers_memory_efficient_attention()
274
+
275
+ pipe.enable_model_cpu_offload()
276
+ return pipe
277
+
278
+
279
+ def show_anns(anns):
280
+ if len(anns) == 0:
281
+ return
282
+ sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
283
+ full_img = None
284
+
285
+ # for ann in sorted_anns:
286
+ for i in range(len(sorted_anns)):
287
+ ann = anns[i]
288
+ m = ann['segmentation']
289
+ if full_img is None:
290
+ full_img = np.zeros((m.shape[0], m.shape[1], 3))
291
+ map = np.zeros((m.shape[0], m.shape[1]), dtype=np.uint16)
292
+ map[m != 0] = i + 1
293
+ color_mask = np.random.random((1, 3)).tolist()[0]
294
+ full_img[m != 0] = color_mask
295
+ full_img = full_img*255
296
+ # anno encoding from https://github.com/LUSSeg/ImageNet-S
297
+ res = np.zeros((map.shape[0], map.shape[1], 3))
298
+ res[:, :, 0] = map % 256
299
+ res[:, :, 1] = map // 256
300
+ res.astype(np.float32)
301
+ full_img = Image.fromarray(np.uint8(full_img))
302
+ return full_img, res
303
+
304
+
305
+ class EditAnythingLoraModel:
306
+ def __init__(self,
307
+ base_model_path='../chilloutmix_NiPrunedFp32Fix',
308
+ lora_model_path='../40806/mix4', use_blip=True,
309
+ blip_processor=None,
310
+ blip_model=None,
311
+ sam_generator=None,
312
+ controlmodel_name='LAION Pretrained(v0-4)-SD15',
313
+ # used when the base model is not an inpainting model.
314
+ extra_inpaint=True,
315
+ ):
316
+ self.device = device
317
+ self.use_blip = use_blip
318
+
319
+ # Diffusion init using diffusers.
320
+ self.default_controlnet_path = config_dict[controlmodel_name]
321
+ self.base_model_path = base_model_path
322
+ self.lora_model_path = lora_model_path
323
+ self.default_enable_all_generate = False
324
+ self.extra_inpaint = extra_inpaint
325
+ self.pipe = obtain_generation_model(
326
+ base_model_path, lora_model_path, self.default_controlnet_path, generation_only=False, extra_inpaint=extra_inpaint)
327
+
328
+ # Segment-Anything init.
329
+ if sam_generator is not None:
330
+ self.sam_generator = sam_generator
331
+ else:
332
+ self.sam_generator = init_sam_model()
333
+
334
+ # BLIP2 init.
335
+ if use_blip:
336
+ if blip_processor is not None:
337
+ self.blip_processor = blip_processor
338
+ else:
339
+ self.blip_processor = init_blip_processor()
340
+
341
+ if blip_model is not None:
342
+ self.blip_model = blip_model
343
+ else:
344
+ self.blip_model = init_blip_model()
345
+
346
+ def get_blip2_text(self, image):
347
+ inputs = self.blip_processor(image, return_tensors="pt").to(
348
+ self.device, torch.float16)
349
+ generated_ids = self.blip_model.generate(**inputs, max_new_tokens=50)
350
+ generated_text = self.blip_processor.batch_decode(
351
+ generated_ids, skip_special_tokens=True)[0].strip()
352
+ return generated_text
353
+
354
+ def get_sam_control(self, image):
355
+ masks = self.sam_generator.generate(image)
356
+ full_img, res = show_anns(masks)
357
+ return full_img, res
358
+
359
+ @torch.inference_mode()
360
+ def process(self, condition_model, source_image, enable_all_generate, mask_image, control_scale, enable_auto_prompt, prompt, a_prompt, n_prompt, num_samples, image_resolution, detect_resolution, ddim_steps, guess_mode, strength, scale, seed, eta):
361
+
362
+ input_image = source_image["image"]
363
+ if mask_image is None:
364
+ if enable_all_generate != self.default_enable_all_generate:
365
+ self.pipe = obtain_generation_model(
366
+ self.base_model_path, self.lora_model_path, config_dict[condition_model], enable_all_generate, self.extra_inpaint)
367
+ self.default_enable_all_generate = enable_all_generate
368
+ if enable_all_generate:
369
+ print("source_image",
370
+ source_image["mask"].shape, input_image.shape,)
371
+ mask_image = np.ones(
372
+ (input_image.shape[0], input_image.shape[1], 3))*255
373
+ else:
374
+ mask_image = source_image["mask"]
375
+ if self.default_controlnet_path != config_dict[condition_model]:
376
+ print("To Use:", config_dict[condition_model],
377
+ "Current:", self.default_controlnet_path)
378
+ print("Change condition model to:", config_dict[condition_model])
379
+ self.pipe = obtain_generation_model(
380
+ self.base_model_path, self.lora_model_path, config_dict[condition_model], enable_all_generate, self.extra_inpaint)
381
+ self.default_controlnet_path = config_dict[condition_model]
382
+ torch.cuda.empty_cache()
383
+
384
+ with torch.no_grad():
385
+ if self.use_blip and enable_auto_prompt:
386
+ print("Generating text:")
387
+ blip2_prompt = self.get_blip2_text(input_image)
388
+ print("Generated text:", blip2_prompt)
389
+ if len(prompt) > 0:
390
+ prompt = blip2_prompt + ',' + prompt
391
+ else:
392
+ prompt = blip2_prompt
393
+
394
+ input_image = HWC3(input_image)
395
+
396
+ img = resize_image(input_image, image_resolution)
397
+ H, W, C = img.shape
398
+
399
+ print("Generating SAM seg:")
400
+ # the default SAM model is trained with 1024 size.
401
+ full_segmask, detected_map = self.get_sam_control(
402
+ resize_image(input_image, detect_resolution))
403
+
404
+ detected_map = HWC3(detected_map.astype(np.uint8))
405
+ detected_map = cv2.resize(
406
+ detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
407
+
408
+ control = torch.from_numpy(
409
+ detected_map.copy()).float().cuda()
410
+ control = torch.stack([control for _ in range(num_samples)], dim=0)
411
+ control = einops.rearrange(control, 'b h w c -> b c h w').clone()
412
+
413
+ mask_image = HWC3(mask_image.astype(np.uint8))
414
+ mask_image = cv2.resize(
415
+ mask_image, (W, H), interpolation=cv2.INTER_LINEAR)
416
+ if self.extra_inpaint:
417
+ inpaint_image = make_inpaint_condition(img, mask_image)
418
+ mask_image = Image.fromarray(mask_image)
419
+
420
+ if seed == -1:
421
+ seed = random.randint(0, 65535)
422
+ seed_everything(seed)
423
+ generator = torch.manual_seed(seed)
424
+ positive_prompt = prompt + ', ' + a_prompt
425
+ negative_prompt = n_prompt
426
+ prompt_embeds, negative_prompt_embeds = get_pipeline_embeds(
427
+ self.pipe, positive_prompt, negative_prompt, "cuda")
428
+ prompt_embeds = torch.cat([prompt_embeds] * num_samples, dim=0)
429
+ negative_prompt_embeds = torch.cat(
430
+ [negative_prompt_embeds] * num_samples, dim=0)
431
+ if enable_all_generate and self.extra_inpaint:
432
+ print(control.shape, control_scale)
433
+ self.pipe.safety_checker = lambda images, clip_input: (
434
+ images, False)
435
+ x_samples = self.pipe(
436
+ prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
437
+ num_images_per_prompt=num_samples,
438
+ num_inference_steps=ddim_steps,
439
+ generator=generator,
440
+ height=H,
441
+ width=W,
442
+ image=control.type(torch.float16),
443
+ controlnet_conditioning_scale=float(control_scale),
444
+ ).images
445
+ elif self.extra_inpaint:
446
+ x_samples = self.pipe(
447
+ image=img,
448
+ mask_image=mask_image,
449
+ prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
450
+ num_images_per_prompt=num_samples,
451
+ num_inference_steps=ddim_steps,
452
+ generator=generator,
453
+ controlnet_conditioning_image=[control.type(
454
+ torch.float16), inpaint_image.type(torch.float16)],
455
+ height=H,
456
+ width=W,
457
+ controlnet_conditioning_scale=(float(control_scale), 1.0),
458
+ ).images
459
+ else:
460
+ x_samples = self.pipe(
461
+ image=img,
462
+ mask_image=mask_image,
463
+ prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds,
464
+ num_images_per_prompt=num_samples,
465
+ num_inference_steps=ddim_steps,
466
+ generator=generator,
467
+ controlnet_conditioning_image=control.type(torch.float16),
468
+ height=H,
469
+ width=W,
470
+ controlnet_conditioning_scale=float(control_scale),
471
+ ).images
472
+
473
+ results = [x_samples[i] for i in range(num_samples)]
474
+ return [full_segmask, mask_image] + results, prompt
475
+
476
+ def download_image(url):
477
+ response = requests.get(url)
478
+ return Image.open(BytesIO(response.content)).convert("RGB")
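For orientation, here is a minimal standalone sketch of how `download_image` and the `source_image` payload consumed by `process` fit together. The URL is a placeholder, and the zero mask simply represents "no region drawn", which is the case the `enable_all_generate` branch above covers by filling the mask with 255.

import numpy as np
import requests
from io import BytesIO
from PIL import Image

def download_image(url):
    # Same helper as above: fetch an image over HTTP and return it as RGB.
    response = requests.get(url)
    return Image.open(BytesIO(response.content)).convert("RGB")

# Gradio's image editor hands `process` a dict holding the raw image and the
# user-drawn mask at the same spatial size; an all-zero mask means nothing was drawn.
image = np.array(download_image("https://example.com/sample.jpg"))  # placeholder URL
source_image = {"image": image, "mask": np.zeros_like(image)}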
utils/stable_diffusion_controlnet_inpaint.py CHANGED
@@ -1,7 +1,7 @@
1
  # Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
2
  # From https://raw.githubusercontent.com/huggingface/diffusers/53377ef83c6446033f3ee506e3ef718db817b293/examples/community/stable_diffusion_controlnet_inpaint.py
3
  import inspect
4
- from typing import Any, Callable, Dict, List, Optional, Union
5
 
6
  import numpy as np
7
  import PIL.Image
@@ -11,6 +11,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
11
 
12
  from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
13
  from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 
14
  from diffusers.schedulers import KarrasDiffusionSchedulers
15
  from diffusers.utils import (
16
  PIL_INTERPOLATION,
@@ -19,7 +20,7 @@ from diffusers.utils import (
19
  randn_tensor,
20
  replace_example_docstring,
21
  )
22
-
23
 
24
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
25
 
@@ -184,7 +185,7 @@ def prepare_mask_image(mask_image):
184
 
185
 
186
  def prepare_controlnet_conditioning_image(
187
- controlnet_conditioning_image, width, height, batch_size, num_images_per_prompt, device, dtype
188
  ):
189
  if not isinstance(controlnet_conditioning_image, torch.Tensor):
190
  if isinstance(controlnet_conditioning_image, PIL.Image.Image):
@@ -214,10 +215,13 @@ def prepare_controlnet_conditioning_image(
214
 
215
  controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
216
 
 
 
 
217
  return controlnet_conditioning_image
218
 
219
 
220
- class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
221
  """
222
  Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
223
  """
@@ -230,7 +234,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
230
  text_encoder: CLIPTextModel,
231
  tokenizer: CLIPTokenizer,
232
  unet: UNet2DConditionModel,
233
- controlnet: ControlNetModel,
234
  scheduler: KarrasDiffusionSchedulers,
235
  safety_checker: StableDiffusionSafetyChecker,
236
  feature_extractor: CLIPImageProcessor,
@@ -253,7 +257,8 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
253
  "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
254
  " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
255
  )
256
-
 
257
  self.register_modules(
258
  vae=vae,
259
  text_encoder=text_encoder,
@@ -522,6 +527,42 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
522
  extra_step_kwargs["generator"] = generator
523
  return extra_step_kwargs
524
 
525
  def check_inputs(
526
  self,
527
  prompt,
@@ -534,6 +575,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
534
  negative_prompt=None,
535
  prompt_embeds=None,
536
  negative_prompt_embeds=None,
 
537
  ):
538
  if height % 8 != 0 or width % 8 != 0:
539
  raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -572,45 +614,35 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
572
  f" {negative_prompt_embeds.shape}."
573
  )
574
 
575
- controlnet_cond_image_is_pil = isinstance(controlnet_conditioning_image, PIL.Image.Image)
576
- controlnet_cond_image_is_tensor = isinstance(controlnet_conditioning_image, torch.Tensor)
577
- controlnet_cond_image_is_pil_list = isinstance(controlnet_conditioning_image, list) and isinstance(
578
- controlnet_conditioning_image[0], PIL.Image.Image
579
- )
580
- controlnet_cond_image_is_tensor_list = isinstance(controlnet_conditioning_image, list) and isinstance(
581
- controlnet_conditioning_image[0], torch.Tensor
582
- )
583
-
584
- if (
585
- not controlnet_cond_image_is_pil
586
- and not controlnet_cond_image_is_tensor
587
- and not controlnet_cond_image_is_pil_list
588
- and not controlnet_cond_image_is_tensor_list
589
- ):
590
- raise TypeError(
591
- "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
592
- )
593
-
594
- if controlnet_cond_image_is_pil:
595
- controlnet_cond_image_batch_size = 1
596
- elif controlnet_cond_image_is_tensor:
597
- controlnet_cond_image_batch_size = controlnet_conditioning_image.shape[0]
598
- elif controlnet_cond_image_is_pil_list:
599
- controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
600
- elif controlnet_cond_image_is_tensor_list:
601
- controlnet_cond_image_batch_size = len(controlnet_conditioning_image)
602
-
603
- if prompt is not None and isinstance(prompt, str):
604
- prompt_batch_size = 1
605
- elif prompt is not None and isinstance(prompt, list):
606
- prompt_batch_size = len(prompt)
607
- elif prompt_embeds is not None:
608
- prompt_batch_size = prompt_embeds.shape[0]
609
-
610
- if controlnet_cond_image_batch_size != 1 and controlnet_cond_image_batch_size != prompt_batch_size:
611
- raise ValueError(
612
- f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {controlnet_cond_image_batch_size}, prompt batch size: {prompt_batch_size}"
613
- )
614
 
615
  if isinstance(image, torch.Tensor) and not isinstance(mask_image, torch.Tensor):
616
  raise TypeError("if `image` is a tensor, `mask_image` must also be a tensor")
@@ -630,6 +662,8 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
630
  image_channels, image_height, image_width = image.shape
631
  elif image.ndim == 4:
632
  image_batch_size, image_channels, image_height, image_width = image.shape
 
 
633
 
634
  if mask_image.ndim == 2:
635
  mask_image_batch_size = 1
@@ -664,8 +698,11 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
664
 
665
  single_image_latent_channels = self.vae.config.latent_channels
666
 
667
- total_latent_channels = single_image_latent_channels * 2 + mask_image_channels
668
-
 
 
 
669
  if total_latent_channels != self.unet.config.in_channels:
670
  raise ValueError(
671
  f"The config of `pipeline.unet` expects {self.unet.config.in_channels} but received"
@@ -797,7 +834,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
797
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
798
  callback_steps: int = 1,
799
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
800
- controlnet_conditioning_scale: float = 1.0,
801
  ):
802
  r"""
803
  Function invoked when calling the pipeline for generation.
@@ -897,6 +934,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
897
  negative_prompt,
898
  prompt_embeds,
899
  negative_prompt_embeds,
 
900
  )
901
 
902
  # 2. Define call parameters
@@ -913,6 +951,9 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
913
  # corresponds to doing no classifier free guidance.
914
  do_classifier_free_guidance = guidance_scale > 1.0
915
 
 
 
 
916
  # 3. Encode input prompt
917
  prompt_embeds = self._encode_prompt(
918
  prompt,
@@ -929,15 +970,37 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
929
 
930
  mask_image = prepare_mask_image(mask_image)
931
 
932
- controlnet_conditioning_image = prepare_controlnet_conditioning_image(
933
- controlnet_conditioning_image,
934
- width,
935
- height,
936
- batch_size * num_images_per_prompt,
937
- num_images_per_prompt,
938
- device,
939
- self.controlnet.dtype,
940
- )
941
 
942
  masked_image = image * (mask_image < 0.5)
943
 
@@ -958,29 +1021,45 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
958
  latents,
959
  )
960
 
961
- mask_image_latents = self.prepare_mask_latents(
962
- mask_image,
963
- batch_size * num_images_per_prompt,
964
- height,
965
- width,
966
- prompt_embeds.dtype,
967
- device,
968
- do_classifier_free_guidance,
969
- )
970
 
971
- masked_image_latents = self.prepare_masked_image_latents(
972
- masked_image,
973
- batch_size * num_images_per_prompt,
974
- height,
975
- width,
976
- prompt_embeds.dtype,
977
- device,
978
- generator,
979
- do_classifier_free_guidance,
980
- )
981
 
982
- if do_classifier_free_guidance:
983
- controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
984
 
985
  # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
986
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -997,25 +1076,22 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
997
  non_inpainting_latent_model_input = self.scheduler.scale_model_input(
998
  non_inpainting_latent_model_input, t
999
  )
1000
-
1001
- inpainting_latent_model_input = torch.cat(
1002
- [non_inpainting_latent_model_input, mask_image_latents, masked_image_latents], dim=1
1003
- )
 
 
1004
 
1005
  down_block_res_samples, mid_block_res_sample = self.controlnet(
1006
  non_inpainting_latent_model_input,
1007
  t,
1008
  encoder_hidden_states=prompt_embeds,
1009
  controlnet_cond=controlnet_conditioning_image,
 
1010
  return_dict=False,
1011
  )
1012
 
1013
- down_block_res_samples = [
1014
- down_block_res_sample * controlnet_conditioning_scale
1015
- for down_block_res_sample in down_block_res_samples
1016
- ]
1017
- mid_block_res_sample *= controlnet_conditioning_scale
1018
-
1019
  # predict the noise residual
1020
  noise_pred = self.unet(
1021
  inpainting_latent_model_input,
@@ -1039,6 +1115,14 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline):
1039
  progress_bar.update()
1040
  if callback is not None and i % callback_steps == 0:
1041
1042
 
1043
  # If we do sequential model offloading, let's offload unet and controlnet
1044
  # manually for max memory savings
 
1
  # Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
2
  # From https://raw.githubusercontent.com/huggingface/diffusers/53377ef83c6446033f3ee506e3ef718db817b293/examples/community/stable_diffusion_controlnet_inpaint.py
3
  import inspect
4
+ from typing import Any, Callable, Dict, List, Optional, Union, Tuple
5
 
6
  import numpy as np
7
  import PIL.Image
 
11
 
12
  from diffusers import AutoencoderKL, ControlNetModel, DiffusionPipeline, UNet2DConditionModel, logging
13
  from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
14
+ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel
15
  from diffusers.schedulers import KarrasDiffusionSchedulers
16
  from diffusers.utils import (
17
  PIL_INTERPOLATION,
 
20
  randn_tensor,
21
  replace_example_docstring,
22
  )
23
+ from diffusers.loaders import LoraLoaderMixin
24
 
25
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
26
 
 
185
 
186
 
187
  def prepare_controlnet_conditioning_image(
188
+ controlnet_conditioning_image, width, height, batch_size, num_images_per_prompt, device, dtype, do_classifier_free_guidance,
189
  ):
190
  if not isinstance(controlnet_conditioning_image, torch.Tensor):
191
  if isinstance(controlnet_conditioning_image, PIL.Image.Image):
 
215
 
216
  controlnet_conditioning_image = controlnet_conditioning_image.to(device=device, dtype=dtype)
217
 
218
+ if do_classifier_free_guidance:
219
+ controlnet_conditioning_image = torch.cat([controlnet_conditioning_image] * 2)
220
+
221
  return controlnet_conditioning_image
222
 
223
 
224
+ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, LoraLoaderMixin):
225
  """
226
  Inspired by: https://github.com/haofanwang/ControlNet-for-Diffusers/
227
  """
 
234
  text_encoder: CLIPTextModel,
235
  tokenizer: CLIPTokenizer,
236
  unet: UNet2DConditionModel,
237
+ controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
238
  scheduler: KarrasDiffusionSchedulers,
239
  safety_checker: StableDiffusionSafetyChecker,
240
  feature_extractor: CLIPImageProcessor,
 
257
  "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
258
  " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
259
  )
260
+ if isinstance(controlnet, (list, tuple)):
261
+ controlnet = MultiControlNetModel(controlnet)
262
  self.register_modules(
263
  vae=vae,
264
  text_encoder=text_encoder,
 
527
  extra_step_kwargs["generator"] = generator
528
  return extra_step_kwargs
529
 
530
+ def check_controlnet_conditioning_image(self, image, prompt, prompt_embeds):
531
+ image_is_pil = isinstance(image, PIL.Image.Image)
532
+ image_is_tensor = isinstance(image, torch.Tensor)
533
+ image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image)
534
+ image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor)
535
+
536
+ if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list:
537
+ raise TypeError(
538
+ "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors"
539
+ )
540
+
541
+ if image_is_pil:
542
+ image_batch_size = 1
543
+ elif image_is_tensor:
544
+ image_batch_size = image.shape[0]
545
+ elif image_is_pil_list:
546
+ image_batch_size = len(image)
547
+ elif image_is_tensor_list:
548
+ image_batch_size = len(image)
549
+ else:
550
+ raise ValueError("controlnet condition image is not valid")
551
+
552
+ if prompt is not None and isinstance(prompt, str):
553
+ prompt_batch_size = 1
554
+ elif prompt is not None and isinstance(prompt, list):
555
+ prompt_batch_size = len(prompt)
556
+ elif prompt_embeds is not None:
557
+ prompt_batch_size = prompt_embeds.shape[0]
558
+ else:
559
+ raise ValueError("prompt or prompt_embeds are not valid")
560
+
561
+ if image_batch_size != 1 and image_batch_size != prompt_batch_size:
562
+ raise ValueError(
563
+ f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}"
564
+ )
565
+
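As a standalone illustration of the batch-size rule this new check enforces (not part of the commit; the helper below is a hypothetical restatement): a single condition image may pair with any prompt batch, while a batched tensor must match the prompt batch size.

import torch
import PIL.Image

def cond_image_batch_size(image):
    # Hypothetical restatement of the accepted forms checked above:
    # a PIL image, a tensor, a list of PIL images, or a list of tensors.
    if isinstance(image, PIL.Image.Image):
        return 1
    if isinstance(image, torch.Tensor):
        return image.shape[0]
    if isinstance(image, list):
        return len(image)
    raise TypeError("unsupported controlnet condition image type")

prompt_batch = 3                                            # e.g. three prompts
ok = cond_image_batch_size(PIL.Image.new("RGB", (64, 64)))  # 1 -> accepted with any prompt batch
bad = cond_image_batch_size(torch.rand(2, 3, 64, 64))       # 2 -> rejected (neither 1 nor 3)
print(ok, bad, prompt_batch)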
566
  def check_inputs(
567
  self,
568
  prompt,
 
575
  negative_prompt=None,
576
  prompt_embeds=None,
577
  negative_prompt_embeds=None,
578
+ controlnet_conditioning_scale=None,
579
  ):
580
  if height % 8 != 0 or width % 8 != 0:
581
  raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
614
  f" {negative_prompt_embeds.shape}."
615
  )
616
 
617
+ # check controlnet condition image
618
+ if isinstance(self.controlnet, ControlNetModel):
619
+ self.check_controlnet_conditioning_image(controlnet_conditioning_image, prompt, prompt_embeds)
620
+ elif isinstance(self.controlnet, MultiControlNetModel):
621
+ if not isinstance(controlnet_conditioning_image, list):
622
+ raise TypeError("For multiple controlnets: `image` must be type `list`")
623
+ if len(controlnet_conditioning_image) != len(self.controlnet.nets):
624
+ raise ValueError(
625
+ "For multiple controlnets: `image` must have the same length as the number of controlnets."
626
+ )
627
+ for image_ in controlnet_conditioning_image:
628
+ self.check_controlnet_conditioning_image(image_, prompt, prompt_embeds)
629
+ else:
630
+ assert False
631
+
632
+ # Check `controlnet_conditioning_scale`
633
+ if isinstance(self.controlnet, ControlNetModel):
634
+ if not isinstance(controlnet_conditioning_scale, float):
635
+ raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
636
+ elif isinstance(self.controlnet, MultiControlNetModel):
637
+ if isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
638
+ self.controlnet.nets
639
+ ):
640
+ raise ValueError(
641
+ "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
642
+ " the same length as the number of controlnets"
643
+ )
644
+ else:
645
+ assert False
646
 
647
  if isinstance(image, torch.Tensor) and not isinstance(mask_image, torch.Tensor):
648
  raise TypeError("if `image` is a tensor, `mask_image` must also be a tensor")
 
662
  image_channels, image_height, image_width = image.shape
663
  elif image.ndim == 4:
664
  image_batch_size, image_channels, image_height, image_width = image.shape
665
+ else:
666
+ assert False
667
 
668
  if mask_image.ndim == 2:
669
  mask_image_batch_size = 1
 
698
 
699
  single_image_latent_channels = self.vae.config.latent_channels
700
 
701
+ if self.unet.config.in_channels==4:
702
+ total_latent_channels = single_image_latent_channels # support base model without inpainting ability.
703
+ else:
704
+ total_latent_channels = single_image_latent_channels * 2 + mask_image_channels
705
+
706
  if total_latent_channels != self.unet.config.in_channels:
707
  raise ValueError(
708
  f"The config of `pipeline.unet` expects {self.unet.config.in_channels} but received"
 
834
  callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
835
  callback_steps: int = 1,
836
  cross_attention_kwargs: Optional[Dict[str, Any]] = None,
837
+ controlnet_conditioning_scale: Union[float, List[float]] = 1.0,
838
  ):
839
  r"""
840
  Function invoked when calling the pipeline for generation.
 
934
  negative_prompt,
935
  prompt_embeds,
936
  negative_prompt_embeds,
937
+ controlnet_conditioning_scale,
938
  )
939
 
940
  # 2. Define call parameters
 
951
  # corresponds to doing no classifier free guidance.
952
  do_classifier_free_guidance = guidance_scale > 1.0
953
 
954
+ if isinstance(self.controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float):
955
+ controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(self.controlnet.nets)
956
+
957
  # 3. Encode input prompt
958
  prompt_embeds = self._encode_prompt(
959
  prompt,
 
970
 
971
  mask_image = prepare_mask_image(mask_image)
972
 
973
+ # condition image(s)
974
+ if isinstance(self.controlnet, ControlNetModel):
975
+ controlnet_conditioning_image = prepare_controlnet_conditioning_image(
976
+ controlnet_conditioning_image=controlnet_conditioning_image,
977
+ width=width,
978
+ height=height,
979
+ batch_size=batch_size * num_images_per_prompt,
980
+ num_images_per_prompt=num_images_per_prompt,
981
+ device=device,
982
+ dtype=self.controlnet.dtype,
983
+ do_classifier_free_guidance=do_classifier_free_guidance,
984
+ )
985
+ elif isinstance(self.controlnet, MultiControlNetModel):
986
+ controlnet_conditioning_images = []
987
+
988
+ for image_ in controlnet_conditioning_image:
989
+ image_ = prepare_controlnet_conditioning_image(
990
+ controlnet_conditioning_image=image_,
991
+ width=width,
992
+ height=height,
993
+ batch_size=batch_size * num_images_per_prompt,
994
+ num_images_per_prompt=num_images_per_prompt,
995
+ device=device,
996
+ dtype=self.controlnet.dtype,
997
+ do_classifier_free_guidance=do_classifier_free_guidance,
998
+ )
999
+ controlnet_conditioning_images.append(image_)
1000
+
1001
+ controlnet_conditioning_image = controlnet_conditioning_images
1002
+ else:
1003
+ assert False
1004
 
1005
  masked_image = image * (mask_image < 0.5)
1006
 
 
1021
  latents,
1022
  )
1023
 
1024
+ noise = latents
 
 
1026
+ if self.unet.config.in_channels!=4:
1027
+ mask_image_latents = self.prepare_mask_latents(
1028
+ mask_image,
1029
+ batch_size * num_images_per_prompt,
1030
+ height,
1031
+ width,
1032
+ prompt_embeds.dtype,
1033
+ device,
1034
+ do_classifier_free_guidance,
1035
+ )
1036
 
1037
+ masked_image_latents = self.prepare_masked_image_latents(
1038
+ masked_image,
1039
+ batch_size * num_images_per_prompt,
1040
+ height,
1041
+ width,
1042
+ prompt_embeds.dtype,
1043
+ device,
1044
+ generator,
1045
+ do_classifier_free_guidance,
1046
+ )
1047
+ if self.unet.config.in_channels==4:
1048
+ init_masked_image_latents, _ = self.prepare_masked_image_latents(
1049
+ image,
1050
+ batch_size * num_images_per_prompt,
1051
+ height,
1052
+ width,
1053
+ prompt_embeds.dtype,
1054
+ device,
1055
+ generator,
1056
+ do_classifier_free_guidance,
1057
+ ).chunk(2)
1058
+ print(type(mask_image), mask_image.shape)
1059
+ _, _, w, h = mask_image.shape
1060
+ mask_image = torch.nn.functional.interpolate(mask_image, ((w // 8, h // 8)), mode='nearest')
1061
+ mask_image = mask_image.to(latents.device).type_as(latents)
1062
+ mask_image = 1 - mask_image
1063
 
1064
  # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1065
  extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
1076
  non_inpainting_latent_model_input = self.scheduler.scale_model_input(
1077
  non_inpainting_latent_model_input, t
1078
  )
1079
+ if self.unet.config.in_channels!=4:
1080
+ inpainting_latent_model_input = torch.cat(
1081
+ [non_inpainting_latent_model_input, mask_image_latents, masked_image_latents], dim=1
1082
+ )
1083
+ else:
1084
+ inpainting_latent_model_input = non_inpainting_latent_model_input
1085
 
1086
  down_block_res_samples, mid_block_res_sample = self.controlnet(
1087
  non_inpainting_latent_model_input,
1088
  t,
1089
  encoder_hidden_states=prompt_embeds,
1090
  controlnet_cond=controlnet_conditioning_image,
1091
+ conditioning_scale=controlnet_conditioning_scale,
1092
  return_dict=False,
1093
  )
1094
 
1095
  # predict the noise residual
1096
  noise_pred = self.unet(
1097
  inpainting_latent_model_input,
 
1115
  progress_bar.update()
1116
  if callback is not None and i % callback_steps == 0:
1117
  callback(i, t, latents)
1118
+ # if self.unet.config.in_channels==4:
1119
+ # # masking for non-inpainting models
1120
+ # init_latents_proper = self.scheduler.add_noise(init_masked_image_latents, noise, t)
1121
+ # latents = (init_latents_proper * mask_image) + (latents * (1 - mask_image))
1122
+
1123
+ if self.unet.config.in_channels==4:
1124
+ # fill the unmasked part with the original image
1125
+ latents = (init_masked_image_latents * mask_image) + (latents * (1 - mask_image))
1126
 
1127
  # If we do sequential model offloading, let's offload unet and controlnet
1128
  # manually for max memory savings
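
To make the new 4-channel (non-inpainting) UNet path easier to follow, here is a toy sketch, with made-up shapes, of the per-step blending it performs: the mask is downsampled to latent resolution with nearest-neighbour interpolation, inverted so that 1 marks pixels to keep, and used to copy the encoded original image back into the unmasked region after each denoising step.

import torch
import torch.nn.functional as F

b, c, h, w = 1, 4, 64, 64                     # latent shape for a 512x512 image (VAE factor 8)
latents = torch.randn(b, c, h, w)             # current denoised latents
init_latents = torch.randn(b, c, h, w)        # stand-in for the VAE-encoded source image

pixel_mask = torch.zeros(b, 1, h * 8, w * 8)  # user mask at image resolution, 1 = edit here
pixel_mask[:, :, 100:300, 100:300] = 1.0

latent_mask = F.interpolate(pixel_mask, size=(h, w), mode="nearest")
keep = 1.0 - latent_mask                      # 1 = keep the original content

# Same blend as in the denoising loop above: original latents outside the mask,
# freshly generated latents inside it.
latents = init_latents * keep + latents * (1.0 - keep)
print(latents.shape)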