Spaces: adaface-neurips (Runtime error)

Commit 57aa583
adaface-neurips committed
1 Parent(s): 7e25f02

Separate generate_id_prompt_embeds, Remove nsfw and other useless code

Files changed:
- app.py (+10, -16)
- pipline_StableDiffusion_ConsistentID.py → pipline_ConsistentID.py (renamed, +104, -120)
app.py CHANGED
@@ -9,7 +9,7 @@ from datetime import datetime
 from PIL import Image
 from diffusers.utils import load_image
 from diffusers import EulerDiscreteScheduler
-from pipline_StableDiffusion_ConsistentID import ConsistentIDStableDiffusionPipeline
+from pipline_ConsistentID import ConsistentIDPipeline
 from huggingface_hub import hf_hub_download
 ### Model can be imported from https://github.com/zllrunning/face-parsing.PyTorch?tab=readme-ov-file
 ### We use the ckpt of 79999_iter.pth: https://drive.google.com/open?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812
@@ -31,7 +31,7 @@ consistentID_path = hf_hub_download(repo_id="JackAILab/ConsistentID",
                                     local_dir="./models")

 ### Load base model
-pipe = ConsistentIDStableDiffusionPipeline.from_pretrained(
+pipe = ConsistentIDPipeline.from_pretrained(
     base_model_path,
     torch_dtype=torch.float16,
     safety_checker=None, # use_safetensors=True,
@@ -54,7 +54,7 @@ pipe.load_ConsistentID_model(
     weight_name=os.path.basename(consistentID_path),
     trigger_word="img",
 )
-
+pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)

 ### Load to cuda
 pipe.to(device)
@@ -65,20 +65,18 @@ pipe.FacialEncoder.to(device)

 @spaces.GPU
 def process(selected_template_images, custom_image, prompt,
-            negative_prompt, prompt_selected,
+            negative_prompt, prompt_selected, model_selected_tab,
             prompt_selected_tab, width, height, merge_steps, seed_set):

+    # The gradio UI only supports one image at a time.
     if model_selected_tab==0:
-
+        subj_images = load_image(Image.open(selected_template_images))
     else:
-
+        subj_images = load_image(Image.fromarray(custom_image))

     if prompt_selected_tab==0:
         prompt = prompt_selected
         negative_prompt = ""
-        need_safetycheck = False
-    else:
-        need_safetycheck = True

     # hyper-parameter
     num_steps = 50
@@ -110,17 +108,14 @@ def process(selected_template_images, custom_image, prompt,
         prompt=prompt,
         width=width,
         height=height,
-
+        input_subj_image_objs=subj_images,
         negative_prompt=negative_prompt,
         num_images_per_prompt=1,
         num_inference_steps=num_steps,
         start_merge_step=merge_steps,
         generator=generator,
-        retouching=retouching,
-        need_safetycheck=need_safetycheck,
     ).images[0]

-    current_date = datetime.today()
     return np.array(images)

 # Gets the templates
@@ -174,7 +169,6 @@ with gr.Blocks(title="ConsistentID Demo") as demo:
     for i, tab in enumerate(prompt_selected_tabs):
         tab.select(fn=lambda tabnum=i: tabnum, inputs=[], outputs=[prompt_selected_tab])

-    retouching = gr.Checkbox(label="face retouching",value=False,visible=False)
     width = gr.Slider(label="image width",minimum=256,maximum=768,value=512,step=8)
     height = gr.Slider(label="image height",minimum=256,maximum=768,value=768,step=8)
     width.release(lambda x,y: min(1280-x,y), inputs=[width,height], outputs=[height])
@@ -191,7 +185,7 @@ with gr.Blocks(title="ConsistentID Demo") as demo:
     - At the same time, use prompt with \"man\" or \"woman\" instead of \"person\" as much as possible, as that may cause the model to be confused whether the protagonist is male or female.
     - Due to insufficient graphics memory on the demo server, there is an upper limit on the resolution for generating samples. We will support the generation of SDXL as soon as possible<br/><br/>
     ''')
-    btn.click(fn=process, inputs=[selected_template_images,custom_image,prompt,nagetive_prompt,prompt_selected,
-
+    btn.click(fn=process, inputs=[selected_template_images, custom_image,prompt, nagetive_prompt, prompt_selected,
+                                  model_selected_tab, prompt_selected_tab, width, height, merge_steps, seed_set], outputs=out)

 demo.launch(server_name='0.0.0.0', ssl_verify=False)

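For orientation, the updated app.py boils down to the call pattern sketched below. This is a minimal sketch reassembled from the added lines above, not the exact demo code: base_model_path, the ConsistentID weight path, and face.jpg are placeholders, and load_ConsistentID_model() is shown only with the keyword arguments visible in this diff (any preceding arguments are elided).

import os
import torch
from PIL import Image
from diffusers import EulerDiscreteScheduler
from diffusers.utils import load_image
from pipline_ConsistentID import ConsistentIDPipeline

base_model_path = "./models/sd15-base"               # placeholder SD 1.5 checkpoint directory
consistentID_path = "./models/ConsistentID-v1.bin"   # placeholder; app.py obtains this via hf_hub_download

# Build the renamed pipeline and swap in the Euler scheduler, which this commit now does explicitly.
pipe = ConsistentIDPipeline.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    safety_checker=None,
)
pipe.load_ConsistentID_model(
    # ...arguments not shown in this diff are omitted here...
    weight_name=os.path.basename(consistentID_path),
    trigger_word="img",
)
pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

# One subject image per call, passed through the new input_subj_image_objs argument.
subj_image = load_image(Image.open("face.jpg"))
image = pipe(
    prompt="A man img, police officer, half body shot",   # placeholder prompt containing the trigger word
    negative_prompt="",
    width=512,
    height=768,
    input_subj_image_objs=subj_image,
    num_images_per_prompt=1,
    num_inference_steps=50,
    start_merge_step=30,                                   # placeholder; the demo exposes this as "merge_steps"
    generator=torch.Generator(device="cuda").manual_seed(42),
).images[0]
image.save("output.png")
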
pipline_StableDiffusion_ConsistentID.py → pipline_ConsistentID.py RENAMED
@@ -5,7 +5,6 @@ import numpy as np
 from PIL import Image
 import torch
 from torchvision import transforms
-from torchvision.utils import save_image
 from insightface.app import FaceAnalysis
 ### insight-face installation can be found at https://github.com/deepinsight/insightface
 from safetensors import safe_open
@@ -27,7 +26,7 @@ PipelineImageInput = Union[
 ]

 ### Download the pretrained model from huggingface and put it locally, then place the model in a local directory and specify the directory location.
-class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
+class ConsistentIDPipeline(StableDiffusionPipeline):

     def cuda(self, dtype=torch.float16, use_xformers=False):
         self.to('cuda', dtype)
@@ -191,7 +190,8 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
     # parsed_image_parts2 is a batched tensor of parsed_image_parts with bs=1. It only contains the facial areas of one input image.
     # clip_encoder maps image parts to image-space diffusion prompts.
     # Then the facial class token embeddings are replaced with the fused (multi_facial_embeds, prompt_embeds[class_tokens_mask]).
-    def get_local_facial_embeds(self, prompt_embeds,
+    def get_local_facial_embeds(self, prompt_embeds, uncond_prompt_embeds, parsed_image_parts2,
+                                facial_token_masks, valid_facial_token_idx_mask, calc_uncond=True):

         hidden_states = []
         uncond_hidden_states = []
@@ -209,8 +209,10 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         # multi_facial_embeds: [1, 5, 257, 1280].
         facial_prompt_embeds = self.FacialEncoder(prompt_embeds, multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)

+        if not calc_uncond:
+            return facial_prompt_embeds, None
         # unconditional prompt.
-        uncond_facial_prompt_embeds = self.FacialEncoder(
+        uncond_facial_prompt_embeds = self.FacialEncoder(uncond_prompt_embeds, uncond_multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)

         return facial_prompt_embeds, uncond_facial_prompt_embeds

@@ -285,9 +287,8 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         return vis_parsing_anno_color, vis_parsing_anno

     @torch.inference_mode()
-    def extract_facemask(self,
-
-        vis_parsing_anno_color, vis_parsing_anno = self.parse_face_mask(input_image_file)
+    def extract_facemask(self, input_image_obj):
+        vis_parsing_anno_color, vis_parsing_anno = self.parse_face_mask(input_image_obj)
         parsing_mask_list = masks_for_unique_values(vis_parsing_anno)

         key_parsing_mask_dict = {}
@@ -307,12 +308,11 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):

         return key_parsing_mask_dict, vis_parsing_anno_color

-    def encode_prompt_with_trigger_word(
+    def augment_prompt_with_trigger_word(
         self,
         prompt: str,
         face_caption: str,
         key_parsing_mask_dict = None,
-        image_token = "<|image|>",
         facial_token = "<|facial|>",
         max_num_facials = 5,
         num_id_images: int = 1,
@@ -335,9 +335,9 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
            face_caption_align = ""

         # Remove "<|facial|>" from prompt_face.
-        #
+        # augmented_prompt: 'A person, police officer, half body shot Detail:
         # The person has one nose , two ears , two eyes , and a mouth , '
-
+        augmented_prompt = prompt_face.replace("<|facial|>", "").replace("<|image|>", "")
         tokenizer = self.tokenizer
         facial_token_id = tokenizer.convert_tokens_to_ids(facial_token)
         image_token_id = None
@@ -350,10 +350,10 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask = \
             prepare_image_token_idx(image_token_mask, facial_token_mask, num_id_images, max_num_facials)

-        return
+        return augmented_prompt, clean_input_id, key_parsing_mask_dict_align, facial_token_mask, facial_token_idx, facial_token_idx_mask

     @torch.inference_mode()
-    def extract_parsed_image_parts(self,
+    def extract_parsed_image_parts(self, input_image_obj, key_parsing_mask_dict, image_size=512, max_num_facials=5):
         facial_masks = []
         parsed_image_parts = []
         key_masked_raw_images_dict = {}
@@ -365,7 +365,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         for key in key_parsing_mask_dict:
             key_mask=key_parsing_mask_dict[key]
             facial_masks.append(transform_mask(key_mask))
-            key_masked_raw_image = apply_mask_to_raw_image(
+            key_masked_raw_image = apply_mask_to_raw_image(input_image_obj, key_mask)
             key_masked_raw_images_dict[key] = key_masked_raw_image
             # clip_preprocessor normalizes key_masked_raw_image, so that (masked) zero pixels become non-zero.
             # It also resizes the image to 224x224.
@@ -384,14 +384,88 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):

         return parsed_image_parts, facial_masks, key_masked_raw_images_dict

-
-
-
+    # Release the unet and vae models to save memory.
+    def release_unet_vae(self):
+        unet = edict()
         # Only keep the config and in_channels attributes that are used in the pipeline.
-
-
-
+        unet.config = self.unet.config
+        unet.in_channels = self.unet.in_channels
+        self.unet = unet
+        self.vae = None
+
+    # input_subj_image_obj: an Image object.
+    def generate_id_prompt_embeds(self, prompt, negative_prompt, input_subj_image_obj, device, calc_uncond=True):
+        faceid_embeds = self.extract_faceid(face_image=input_subj_image_obj)
+        face_caption = "The person has one nose, two eyes, two ears, and a mouth."
+        key_parsing_mask_dict, vis_parsing_anno_color = self.extract_facemask(input_subj_image_obj)
+
+        augmented_prompt, clean_input_id, key_parsing_mask_dict_align, \
+        facial_token_mask, facial_token_idx, facial_token_idx_mask \
+            = self.augment_prompt_with_trigger_word(
+                prompt = prompt,
+                face_caption = face_caption,
+                key_parsing_mask_dict=key_parsing_mask_dict,
+                device=device,
+                max_num_facials = 5,
+                num_id_images = 1
+            )
+
+        text_embeds, uncond_text_embeds = self.encode_prompt(
+            augmented_prompt,
+            device=device,
+            num_images_per_prompt=1,
+            do_classifier_free_guidance=calc_uncond,
+            negative_prompt=negative_prompt,
+        )
+
+        # 5. Prepare the input ID images
+        # global_id_embeds: [1, 4, 768]
+        # get_global_id_embeds() extrats OpenCLIP embeddings from the input image and map them to global face prompt embeddings.
+        global_id_embeds, uncond_global_id_embeds = \
+            self.get_global_id_embeds(faceid_embeds, face_image=input_subj_image_obj, s_scale=1.0, shortcut=False)
+
+        # parsed_image_parts: [5, 3, 224, 224]. 5 parts, each part is a 3-channel 224x224 image (resized by CLIP Preprocessor).
+        parsed_image_parts, facial_masks, key_masked_raw_images_dict = \
+            self.extract_parsed_image_parts(input_subj_image_obj, key_parsing_mask_dict_align, image_size=512, max_num_facials=5)
+        parsed_image_parts2 = parsed_image_parts.unsqueeze(0).to(device, dtype=self.torch_dtype)
+        facial_token_mask = facial_token_mask.to(device)
+        facial_token_idx_mask = facial_token_idx_mask.to(device)

+        # key_masked_raw_images_dict: ['Right_Eye', 'Right_Ear', 'Nose', 'Upper_Lip']
+        # for key in key_masked_raw_images_dict:
+        #     key_masked_raw_images_dict[key].save(f"{key}.png")
+
+        # 6. Get the update text embedding
+        # parsed_image_parts2: the facial areas of the input image
+        # text_local_id_embeds: [1, 77, 768]
+        # text_local_id_embeds only differs with text_global_id_embeds on 4 tokens, and is identical
+        # to text_global_id_embeds on the rest 73 tokens.
+        # get_local_facial_embeds() maps parsed_image_parts2 to multi_facial_embeds, and then replaces the class tokens in prompt_embeds
+        # with the fused (id_embeds, prompt_embeds[class_tokens_mask]) whose indices are specified by class_tokens_mask.
+        # parsed_image_parts2: [1, 5, 3, 224, 224]
+        text_local_id_embeds, uncond_text_local_id_embeds = \
+            self.get_local_facial_embeds(text_embeds, uncond_text_embeds, \
+                                         parsed_image_parts2, facial_token_mask, facial_token_idx_mask,
+                                         calc_uncond=calc_uncond)
+
+        # text_global_id_embeds, text_local_global_id_embeds: [1, 81, 768]
+        text_global_id_embeds = torch.cat([text_embeds, global_id_embeds], dim=1)
+        text_local_global_id_embeds = torch.cat([text_local_id_embeds, global_id_embeds], dim=1)
+
+        if calc_uncond:
+            uncond_text_global_id_embeds = torch.cat([uncond_text_local_id_embeds, uncond_global_id_embeds], dim=1)
+            coarse_prompt_embeds = torch.cat([uncond_text_global_id_embeds, text_global_id_embeds], dim=0)
+            fine_prompt_embeds = torch.cat([uncond_text_global_id_embeds, text_local_global_id_embeds], dim=0)
+        else:
+            coarse_prompt_embeds = text_global_id_embeds
+            fine_prompt_embeds = text_local_global_id_embeds
+
+        # fine_prompt_embeds: the conditional part is
+        # (text_global_id_embeds + text_local_global_id_embeds) / 2.
+        fine_prompt_embeds = (coarse_prompt_embeds + fine_prompt_embeds) / 2
+
+        return coarse_prompt_embeds, fine_prompt_embeds
+
     @torch.no_grad()
     def __call__(
         self,
@@ -414,12 +488,8 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         target_size: Optional[Tuple[int, int]] = None,
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: int = 1,
-
+        input_subj_image_objs: PipelineImageInput = None,
         start_merge_step: int = 0,
-        class_tokens_mask: Optional[torch.LongTensor] = None,
-        text_embeds: Optional[torch.FloatTensor] = None,
-        retouching: bool=False,
-        need_safetycheck: bool=True,
     ):
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -438,8 +508,8 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             prompt_embeds,
             negative_prompt_embeds,
         )
-        if not isinstance(
-
+        if not isinstance(input_subj_image_objs, list):
+            input_subj_image_objs = [input_subj_image_objs]

         # 2. Define call parameters
         if prompt is not None and isinstance(prompt, str):
@@ -451,81 +521,11 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):

         device = self._execution_device
         do_classifier_free_guidance = guidance_scale >= 1.0
-
-        input_image_file = input_id_images[0]
-
-        faceid_embeds = self.extract_faceid(face_image=input_image_file)
-        face_caption = "The person has one nose, two eyes, two ears, and a mouth."
-        key_parsing_mask_dict, vis_parsing_anno_color = self.extract_facemask(input_image_file)
-
         assert do_classifier_free_guidance

         # 3. Encode input prompt
-
-
-        prompt_text_only, clean_input_id, key_parsing_mask_dict_align, \
-        facial_token_mask, facial_token_idx, facial_token_idx_mask \
-            = self.encode_prompt_with_trigger_word(
-                prompt = prompt,
-                face_caption = face_caption,
-                key_parsing_mask_dict=key_parsing_mask_dict,
-                device=device,
-                max_num_facials = 5,
-                num_id_images= num_id_images
-            )
-
-        # 4. Encode input prompt without the trigger word for delayed conditioning
-        # encoder_hidden_states = self.text_encoder(clean_input_id.to(device))[0]
-
-        prompt_embeds = self._encode_prompt(
-            prompt_text_only,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=True,
-            negative_prompt=negative_prompt,
-        )
-        uncond_text_embeds = prompt_embeds[0:num_images_per_prompt]
-        text_embeds = prompt_embeds[num_images_per_prompt:]
-
-        # 5. Prepare the input ID images
-        # global_id_embeds: [1, 4, 768]
-        # get_global_id_embeds() extrats OpenCLIP embeddings from the input image and map them to global face prompt embeddings.
-        global_id_embeds, uncond_global_id_embeds = \
-            self.get_global_id_embeds(faceid_embeds, face_image=input_image_file, s_scale=1.0, shortcut=False)
-
-        # parsed_image_parts: [5, 3, 224, 224]. 5 parts, each part is a 3-channel 224x224 image (resized by CLIP Preprocessor).
-        parsed_image_parts, facial_masks, key_masked_raw_images_dict = \
-            self.extract_parsed_image_parts(input_image_file, key_parsing_mask_dict_align, image_size=512, max_num_facials=5)
-        parsed_image_parts2 = parsed_image_parts.unsqueeze(0).to(device, dtype=self.torch_dtype)
-        facial_token_mask = facial_token_mask.to(device)
-        facial_token_idx_mask = facial_token_idx_mask.to(device)
-
-        # key_masked_raw_images_dict: ['Right_Eye', 'Right_Ear', 'Nose', 'Upper_Lip']
-        # for key in key_masked_raw_images_dict:
-        #     key_masked_raw_images_dict[key].save(f"{key}.png")
-
-        # 6. Get the update text embedding
-        # parsed_image_parts2: the facial areas of the input image
-        # text_local_id_embeds: [1, 77, 768]
-        # text_local_id_embeds only differs with text_global_id_embeds on 4 tokens, and is identical
-        # to text_global_id_embeds on the rest 73 tokens.
-        # get_local_facial_embeds() maps parsed_image_parts2 to multi_facial_embeds, and then replaces the class tokens in prompt_embeds
-        # with the fused (id_embeds, prompt_embeds[class_tokens_mask]) whose indices are specified by class_tokens_mask.
-        # parsed_image_parts2: [1, 5, 3, 224, 224]
-        text_local_id_embeds, uncond_text_local_id_embeds = \
-            self.get_local_facial_embeds(text_embeds, uncond_text_embeds, \
-                                         parsed_image_parts2, facial_token_mask, facial_token_idx_mask)
-
-        uncond_text_global_id_embeds = torch.cat([uncond_text_local_id_embeds, uncond_global_id_embeds], dim=1)
-        # text_global_id_embeds, text_local_global_id_embeds: [1, 81, 768]
-        text_global_id_embeds = torch.cat([text_embeds, global_id_embeds], dim=1)
-        text_local_global_id_embeds = torch.cat([text_local_id_embeds, global_id_embeds], dim=1)
-
-        coarse_prompt_embeds = torch.cat([uncond_text_global_id_embeds, text_global_id_embeds], dim=0)
-        fine_prompt_embeds = torch.cat([uncond_text_global_id_embeds, text_local_global_id_embeds], dim=0)
-        # fine_prompt_embeds: the conditional part is
-        # (text_global_id_embeds + text_local_global_id_embeds) / 2.
-        fine_prompt_embeds = (coarse_prompt_embeds + fine_prompt_embeds) / 2
+        coarse_prompt_embeds, fine_prompt_embeds = \
+            self.generate_id_prompt_embeds(prompt, negative_prompt, input_subj_image_objs[0], device)

         # 7. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
@@ -538,7 +538,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             num_channels_latents,
             height,
             width,
-
+            coarse_prompt_embeds.dtype,
             device,
             generator,
             latents,
@@ -584,48 +584,32 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
                 ).prev_sample

                 # call the callback, if provided
-                if i == len(timesteps) - 1 or
-
-                ):
+                if i == len(timesteps) - 1 or \
+                    ( (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 ):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)

         if output_type == "latent":
             image = latents
-            has_nsfw_concept = None
         elif output_type == "pil":
             # 9.1 Post-processing
             image = self.decode_latents(latents)
-
-            # 9.2 Run safety checker
-            if need_safetycheck:
-                image, has_nsfw_concept = self.run_safety_checker(
-                    image, device, prompt_embeds.dtype
-                )
-            else:
-                has_nsfw_concept = None
-
             # 9.3 Convert to PIL
             image = self.numpy_to_pil(image)
         else:
             # 9.1 Post-processing
             image = self.decode_latents(latents)

-            # 9.2 Run safety checker
-            image, has_nsfw_concept = self.run_safety_checker(
-                image, device, prompt_embeds.dtype
-            )
-
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()

         if not return_dict:
-            return (image,
+            return (image, None)

         return StableDiffusionPipelineOutput(
-            images=image, nsfw_content_detected=
+            images=image, nsfw_content_detected=None
         )

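The other half of the commit, visible in the large added block above, is that all ID-conditioning work now lives in generate_id_prompt_embeds(), and __call__() reduces to a single call into it. That also makes the ID prompt embeddings usable on their own. The sketch below is a rough illustration of that standalone use, assuming pipe has already been built and loaded with ConsistentID weights as in app.py; face.jpg and the prompts are placeholders.

import torch
from PIL import Image

device = "cuda"
subj_image = Image.open("face.jpg").convert("RGB")        # placeholder subject photo

# coarse_prompt_embeds / fine_prompt_embeds: [2, 81, 768] when calc_uncond=True
# (unconditional half stacked before the conditional half), [1, 81, 768] otherwise.
coarse_prompt_embeds, fine_prompt_embeds = pipe.generate_id_prompt_embeds(
    "A man img, portrait photo",                           # placeholder prompt containing the trigger word
    "blurry, low quality",                                 # placeholder negative prompt
    subj_image,
    device,
    calc_uncond=True,
)

# If only the embeddings are needed (e.g. to condition another generator), the UNet and VAE
# can be dropped afterwards; release_unet_vae() keeps just unet.config and unet.in_channels.
pipe.release_unet_vae()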