Upload folder using huggingface_hub
- .gitattributes +35 -35
- README.md +12 -12
- adaface-infer.py +131 -0
- adaface-translate.py +208 -0
- adaface/__pycache__/adaface_wrapper.cpython-312.pyc +0 -0
- adaface/__pycache__/adaface_wrapper.cpython-38.pyc +0 -0
- adaface/__pycache__/arc2face_models.cpython-312.pyc +0 -0
- adaface/__pycache__/arc2face_models.cpython-38.pyc +0 -0
- adaface/__pycache__/subj_basis_generator.cpython-312.pyc +0 -0
- adaface/__pycache__/subj_basis_generator.cpython-38.pyc +0 -0
- adaface/__pycache__/util.cpython-312.pyc +0 -0
- adaface/__pycache__/util.cpython-38.pyc +0 -0
- adaface/adaface-infer.py +131 -0
- adaface/adaface-translate.py +208 -0
- adaface/adaface_wrapper.py +297 -0
- adaface/arc2face_models.py +303 -0
- adaface/subj_basis_generator.py +758 -0
- adaface/util.py +342 -0
- adaface_wrapper.py +297 -0
- app.py +203 -0
- arc2face_models.py +303 -0
- models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt +3 -0
- models/arc2face/arc2face/config.json +67 -0
- models/arc2face/arc2face/diffusion_pytorch_model.safetensors +3 -0
- models/arc2face/encoder/config.json +24 -0
- models/arc2face/encoder/pytorch_model.bin +3 -0
- models/insightface/models/antelopev2/1k3d68.onnx +3 -0
- models/insightface/models/antelopev2/2d106det.onnx +3 -0
- models/insightface/models/antelopev2/arcface.onnx +3 -0
- models/insightface/models/antelopev2/genderage.onnx +3 -0
- models/insightface/models/antelopev2/scrfd_10g_bnkps.onnx +3 -0
- models/insightface/models/buffalo_l/1k3d68.onnx +3 -0
- models/insightface/models/buffalo_l/2d106det.onnx +3 -0
- models/insightface/models/buffalo_l/det_10g.onnx +3 -0
- models/insightface/models/buffalo_l/genderage.onnx +3 -0
- models/insightface/models/buffalo_l/w600k_r50.onnx +3 -0
- models/sar/sar.safetensors +3 -0
- requirements.txt +12 -0
- subj_basis_generator.py +758 -0
- util.py +342 -0
.gitattributes
CHANGED
@@ -1,35 +1,35 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,12 +1,12 @@
----
-title: Adaface
-emoji:
-colorFrom:
-colorTo:
-sdk: gradio
-sdk_version: 4.
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: Adaface
+emoji: 🌖
+colorFrom: purple
+colorTo: red
+sdk: gradio
+sdk_version: 4.37.2
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
adaface-infer.py
ADDED
@@ -0,0 +1,131 @@
from adaface.adaface_wrapper import AdaFaceWrapper
import torch
#import torch.nn.functional as F
from PIL import Image
import numpy as np
import os, argparse, glob, re

def save_images(images, num_images_per_row, subject_name, prompt, noise_level, save_dir = "samples-ada"):
    if num_images_per_row > len(images):
        num_images_per_row = len(images)

    os.makedirs(save_dir, exist_ok=True)

    num_columns = int(np.ceil(len(images) / num_images_per_row))
    # Save 4 images as a grid image in save_dir
    grid_image = Image.new('RGB', (512 * num_images_per_row, 512 * num_columns))
    for i, image in enumerate(images):
        image = image.resize((512, 512))
        grid_image.paste(image, (512 * (i % num_images_per_row), 512 * (i // num_images_per_row)))

    prompt_sig = prompt.replace(" ", "_").replace(",", "_")
    grid_filepath = os.path.join(save_dir, f"{subject_name}-{prompt_sig}-noise{noise_level:.02f}.png")
    if os.path.exists(grid_filepath):
        grid_count = 2
        grid_filepath = os.path.join(save_dir, f'{subject_name}-{prompt_sig}-noise{noise_level:.02f}-{grid_count}.jpg')
        while os.path.exists(grid_filepath):
            grid_count += 1
            grid_filepath = os.path.join(save_dir, f'{subject_name}-{prompt_sig}-noise{noise_level:.02f}-{grid_count}.jpg')

    grid_image.save(grid_filepath)
    print(f"Saved to {grid_filepath}")

def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PL_GLOBAL_SEED"] = str(seed)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_path", type=str, default='runwayml/stable-diffusion-v1-5',
                        help="Type of checkpoints to use (default: SD 1.5)")
    parser.add_argument("--embman_ckpt", type=str, required=True,
                        help="Path to the checkpoint of the embedding manager")
    parser.add_argument("--subject", type=str, required=True)
    parser.add_argument("--example_image_count", type=int, default=-1, help="Number of example images to use")
    parser.add_argument("--out_image_count", type=int, default=4, help="Number of images to generate")
    parser.add_argument("--prompt", type=str, default="a woman z in superman costume")
    parser.add_argument("--noise", dest='noise_level', type=float, default=0)
    parser.add_argument("--randface", action="store_true")
    parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                        help="Guidance scale for the diffusion model")
    parser.add_argument("--id_cfg_scale", type=float, default=1,
                        help="CFG scale when generating the identity embeddings")

    parser.add_argument("--subject_string",
                        type=str, default="z",
                        help="Subject placeholder string used in prompts to denote the concept.")
    parser.add_argument("--num_vectors", type=int, default=16,
                        help="Number of vectors used to represent the subject.")
    parser.add_argument("--num_images_per_row", type=int, default=4,
                        help="Number of images to display in a row in the output grid image.")
    parser.add_argument("--num_inference_steps", type=int, default=50,
                        help="Number of DDIM inference steps")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
    parser.add_argument("--seed", type=int, default=42,
                        help="the seed (for reproducible sampling). Set to -1 to disable.")
    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args = parse_args()
    if args.seed != -1:
        seed_everything(args.seed)

    if re.match(r"^\d+$", args.device):
        args.device = f"cuda:{args.device}"
    print(f"Using device {args.device}")

    adaface = AdaFaceWrapper("text2img", args.base_model_path, args.embman_ckpt, args.device,
                             args.subject_string, args.num_vectors, args.num_inference_steps)

    if not args.randface:
        image_folder = args.subject
        if image_folder.endswith("/"):
            image_folder = image_folder[:-1]

        if os.path.isfile(image_folder):
            # Get the second to the last part of the path
            subject_name = os.path.basename(os.path.dirname(image_folder))
            image_paths = [image_folder]

        else:
            subject_name = os.path.basename(image_folder)
            image_types = ["*.jpg", "*.png", "*.jpeg"]
            alltype_image_paths = []
            for image_type in image_types:
                # glob returns the full path.
                image_paths = glob.glob(os.path.join(image_folder, image_type))
                if len(image_paths) > 0:
                    alltype_image_paths.extend(image_paths)

            # Filter out images of "*_mask.png"
            alltype_image_paths = [image_path for image_path in alltype_image_paths if "_mask.png" not in image_path]

            # image_paths contain at most args.example_image_count full image paths.
            if args.example_image_count > 0:
                image_paths = alltype_image_paths[:args.example_image_count]
            else:
                image_paths = alltype_image_paths
    else:
        subject_name = None
        image_paths = None
        image_folder = None

    subject_name = "randface-" + str(torch.seed()) if args.randface else subject_name
    rand_face_embs = torch.randn(1, 512)

    pre_face_embs = rand_face_embs if args.randface else None
    noise = torch.randn(args.out_image_count, 4, 64, 64).cuda()
    # args.noise_level: the *relative* std of the noise added to the face embeddings.
    # A noise level of 0.08 could change gender, but 0.06 is usually safe.
    # adaface_subj_embs is not used. It is generated for the purpose of updating the text encoder (within this function call).
    adaface_subj_embs = adaface.generate_adaface_embeddings(image_paths, image_folder, pre_face_embs, args.randface,
                                                            out_id_embs_scale=args.id_cfg_scale, noise_level=args.noise_level,
                                                            update_text_encoder=True)
    images = adaface(noise, args.prompt, args.guidance_scale, args.out_image_count, verbose=True)
    save_images(images, args.num_images_per_row, subject_name, f"guide{args.guidance_scale}", args.noise_level)
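
For reference, the script above reduces to three calls on AdaFaceWrapper: build the wrapper, fold the subject's face identity into the text encoder, then sample from latent noise. The sketch below is a minimal programmatic version of that flow under stated assumptions: the subject photo paths are hypothetical placeholders, and it assumes the models/ tree from this upload (including the AdaFace checkpoint listed in the file list) sits in the working directory.

import torch
from adaface.adaface_wrapper import AdaFaceWrapper

device = "cuda"
# Checkpoint shipped in this upload; the subject photos below are hypothetical placeholders.
embman_ckpt = "models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt"
image_paths = ["subjects/alice/1.jpg", "subjects/alice/2.jpg"]

# Build a text2img pipeline and register the 16 subject placeholder tokens.
adaface = AdaFaceWrapper("text2img", "runwayml/stable-diffusion-v1-5", embman_ckpt, device,
                         subject_string="z", num_vectors=16, num_inference_steps=50)

# Map the subject's face(s) to 16 embeddings and patch them into the text encoder.
adaface.generate_adaface_embeddings(image_paths, image_folder=None, pre_face_embs=None,
                                    gen_rand_face=False, out_id_embs_scale=1,
                                    noise_level=0, update_text_encoder=True)

# Sample 4 images from pure latent noise, conditioned on a prompt containing the subject string "z".
noise = torch.randn(4, 4, 64, 64).cuda()
images = adaface(noise, "a woman z in superman costume", 4, 4, verbose=True)
images[0].save("sample-0.png")
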
adaface-translate.py
ADDED
@@ -0,0 +1,208 @@
from adaface.adaface_wrapper import AdaFaceWrapper
import torch
#import torch.nn.functional as F
from PIL import Image
import numpy as np
import os, argparse, glob, re, shutil

def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("Boolean value expected.")

def seed_everything(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PL_GLOBAL_SEED"] = str(seed)

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_model_path", type=str, default='models/realisticvision/realisticVisionV40_v40VAE.safetensors',
                        help="Path to the UNet checkpoint (default: RealisticVision 4.0)")
    parser.add_argument("--embman_ckpt", type=str, required=True,
                        help="Path to the checkpoint of the embedding manager")
    parser.add_argument("--in_folder", type=str, required=True, help="Path to the folder containing input images")
    # If True, the input folder contains images of mixed subjects.
    # If False, the input folder contains multiple subfolders, each of which contains images of the same subject.
    parser.add_argument("--is_mix_subj_folder", type=str2bool, const=True, default=False, nargs="?",
                        help="Whether the input folder contains images of mixed subjects")
    parser.add_argument("--max_images_per_subject", type=int, default=5, help="Number of example images used per subject")
    parser.add_argument("--trans_subject_count", type=int, default=-1, help="Number of example images to be translated")
    parser.add_argument("--out_folder", type=str, required=True, help="Path to the folder saving output images")
    parser.add_argument("--out_count_per_input_image", type=int, default=1, help="Number of output images to generate per input image")
    parser.add_argument("--copy_masks", action="store_true", help="Copy the mask images to the output folder")
    parser.add_argument("--noise", dest='noise_level', type=float, default=0)
    parser.add_argument("--scale", dest='guidance_scale', type=float, default=4,
                        help="Guidance scale for the diffusion model")
    parser.add_argument("--ref_img_strength", type=float, default=0.8,
                        help="Strength of the reference image in the output image.")
    parser.add_argument("--subject_string",
                        type=str, default="z",
                        help="Subject placeholder string used in prompts to denote the concept.")
    parser.add_argument("--num_vectors", type=int, default=16,
                        help="Number of vectors used to represent the subject.")
    parser.add_argument("--prompt", type=str, default="a person z")
    parser.add_argument("--num_images_per_row", type=int, default=4,
                        help="Number of images to display in a row in the output grid image.")
    parser.add_argument("--num_inference_steps", type=int, default=50,
                        help="Number of DDIM inference steps")
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use. If num_gpus > 1, use accelerate for distributed execution.")
    parser.add_argument("--device", type=str, default="cuda", help="Device to run the model on")
    parser.add_argument("--seed", type=int, default=42,
                        help="the seed (for reproducible sampling). Set to -1 to disable.")
    args = parser.parse_args()

    return args

if __name__ == "__main__":
    args = parse_args()
    if args.seed != -1:
        seed_everything(args.seed)

    # screen -dm -L -Logfile trans_rv4-2.txt accelerate launch --multi_gpu --num_processes=2 scripts/adaface-translate.py
    # --embman_ckpt logs/subjects-celebrity2024-05-16T17-22-46_zero3-ada/checkpoints/embeddings_gs-30000.pt
    # --base_model_path models/realisticvision/realisticVisionV40_v40VAE.safetensors --in_folder /data/username/VGGface2_HQ_masks/
    # --is_mix_subj_folder 0 --out_folder /data/username/VGGface2_HQ_masks_rv4a --copy_masks --num_gpus 2
    if args.num_gpus > 1:
        from accelerate import PartialState
        distributed_state = PartialState()
        args.device = distributed_state.device
        process_index = distributed_state.process_index
    elif re.match(r"^\d+$", args.device):
        args.device = f"cuda:{args.device}"
        distributed_state = None
        process_index = 0

    adaface = AdaFaceWrapper("img2img", args.base_model_path, args.embman_ckpt, args.device,
                             args.subject_string, args.num_vectors, args.num_inference_steps)

    in_folder = args.in_folder
    if os.path.isfile(in_folder):
        subject_folders = [ os.path.dirname(in_folder) ]
        images_by_subject = [[in_folder]]
    else:
        if not args.is_mix_subj_folder:
            in_folders = [in_folder]
        else:
            in_folders = [ os.path.join(in_folder, subfolder) for subfolder in sorted(os.listdir(in_folder)) ]

        images_by_subject = []
        subject_folders = []
        for in_folder in in_folders:
            image_types = ["*.jpg", "*.png", "*.jpeg"]
            alltype_image_paths = []
            for image_type in image_types:
                # glob returns the full path.
                image_paths = glob.glob(os.path.join(in_folder, image_type))
                if len(image_paths) > 0:
                    alltype_image_paths.extend(image_paths)

            # Filter out images of "*_mask.png"
            alltype_image_paths = [image_path for image_path in alltype_image_paths if "_mask.png" not in image_path]
            alltype_image_paths = sorted(alltype_image_paths)

            if not args.is_mix_subj_folder:
                # image_paths contain at most args.max_images_per_subject full image paths.
                if args.max_images_per_subject > 0:
                    image_paths = alltype_image_paths[:args.max_images_per_subject]
                else:
                    image_paths = alltype_image_paths

                images_by_subject.append(image_paths)
                subject_folders.append(in_folder)
            else:
                # Each image in the folder is treated as an individual subject.
                images_by_subject.extend([[image_path] for image_path in alltype_image_paths])
                subject_folders.extend([in_folder] * len(alltype_image_paths))

            if args.trans_subject_count > 0 and len(subject_folders) >= args.trans_subject_count:
                break

    if args.trans_subject_count > 0:
        images_by_subject = images_by_subject[:args.trans_subject_count]
        subject_folders = subject_folders[:args.trans_subject_count]

    out_image_count = 0
    out_mask_count = 0
    if not args.out_folder.endswith("/"):
        args.out_folder += "/"

    if args.num_gpus > 1:
        # Split the subjects across the GPUs.
        subject_folders = subject_folders[process_index::args.num_gpus]
        images_by_subject = images_by_subject[process_index::args.num_gpus]
        #subject_folders, images_by_subject = distributed_state.split_between_processes(zip(subject_folders, images_by_subject))

    for (subject_folder, image_paths) in zip(subject_folders, images_by_subject):
        # If is_mix_subj_folder, then image_paths only contains 1 image, and we use the file name as the signature of the image.
        # Otherwise, we use the folder name as the signature of the images.
        images_sig = subject_folder if not args.is_mix_subj_folder else os.path.basename(image_paths[0])

        print(f"Translating {images_sig}...")
        with torch.no_grad():
            adaface_subj_embs = adaface.generate_adaface_embeddings(image_paths, subject_folder, None, False,
                                                                    out_id_embs_scale=1, noise_level=args.noise_level,
                                                                    update_text_encoder=True)

        # Replace the first occurrence of "in_folder" with "out_folder" in the path of the subject_folder.
        subject_out_folder = subject_folder.replace(args.in_folder, args.out_folder, 1)
        if not os.path.exists(subject_out_folder):
            os.makedirs(subject_out_folder)
        print(f"Output images will be saved to {subject_out_folder}")

        in_images = []
        for image_path in image_paths:
            image = Image.open(image_path).convert("RGB").resize((512, 512))
            # [512, 512, 3] -> [3, 512, 512].
            image = np.array(image).transpose(2, 0, 1)
            # Convert the image to a tensor of shape (1, 3, 512, 512) and move it to the GPU.
            image = torch.tensor(image).unsqueeze(0).float().cuda()
            in_images.append(image)

        # Put all input images of the subject into a batch. This assumes max_images_per_subject is small.
        # NOTE: For simplicity, we do not check overly large batch sizes.
        in_images = torch.cat(in_images, dim=0)
        # in_images: [5, 3, 512, 512].
        # Normalize the pixel values to [0, 1].
        in_images = in_images / 255.0
        num_out_images = len(in_images) * args.out_count_per_input_image

        with torch.no_grad():
            # args.noise_level: the *relative* std of the noise added to the face embeddings.
            # A noise level of 0.08 could change gender, but 0.06 is usually safe.
            # The returned adaface_subj_embs are already incorporated in the text encoder, and not used explicitly.
            # NOTE: We assume out_count_per_input_image == 1, so that the output images are of the same number as the input images.
            out_images = adaface(in_images, args.prompt, args.guidance_scale, num_out_images, ref_img_strength=args.ref_img_strength)

        for img_i, img in enumerate(out_images):
            # out_images: subj_1, subj_2, ..., subj_n, subj_1, subj_2, ..., subj_n, ...
            subj_i = img_i % len(in_images)
            copy_i = img_i // len(in_images)
            image_filename_stem, image_fileext = os.path.splitext(os.path.basename(image_paths[subj_i]))
            if copy_i == 0:
                img.save(os.path.join(subject_out_folder, f"{image_filename_stem}{image_fileext}"))
            else:
                img.save(os.path.join(subject_out_folder, f"{image_filename_stem}_{copy_i}{image_fileext}"))

            if args.copy_masks:
                mask_path = image_paths[subj_i].replace(image_fileext, "_mask.png")
                if os.path.exists(mask_path):
                    if copy_i == 0:
                        shutil.copy(mask_path, subject_out_folder)
                    else:
                        mask_filename_stem = image_filename_stem
                        shutil.copy(mask_path, os.path.join(subject_out_folder, f"{mask_filename_stem}_{copy_i}_mask.png"))

                    out_mask_count += 1

        out_image_count += len(out_images)

    print(f"{out_image_count} output images and {out_mask_count} masks saved to {args.out_folder}")
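
The multi-GPU branch above shards work by simple round-robin slicing on the process index (accelerate's split_between_processes is left commented out). The following is a minimal, self-contained sketch of that pattern, with a toy subject list standing in for the real subject_folders and images_by_subject; it assumes accelerate is installed and the script is started with accelerate launch for the multi-process case.

# Minimal sketch of the round-robin sharding used in adaface-translate.py.
from accelerate import PartialState

distributed_state = PartialState()          # One instance per launched process.
device = distributed_state.device           # e.g. cuda:0, cuda:1, ...
rank = distributed_state.process_index
world_size = distributed_state.num_processes

# Toy stand-in for subject_folders / images_by_subject in the real script.
all_subjects = [f"subject_{i:03d}" for i in range(10)]
my_subjects = all_subjects[rank::world_size]   # Each process handles a disjoint slice.

for subject in my_subjects:
    print(f"[rank {rank} on {device}] translating {subject}")
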
adaface/__pycache__/adaface_wrapper.cpython-312.pyc
ADDED
Binary file (13.5 kB)
adaface/__pycache__/adaface_wrapper.cpython-38.pyc
ADDED
Binary file (8.03 kB)
adaface/__pycache__/arc2face_models.cpython-312.pyc
ADDED
Binary file (16.1 kB)
adaface/__pycache__/arc2face_models.cpython-38.pyc
ADDED
Binary file (7 kB)
adaface/__pycache__/subj_basis_generator.cpython-312.pyc
ADDED
Binary file (30.1 kB)
adaface/__pycache__/subj_basis_generator.cpython-38.pyc
ADDED
Binary file (17.6 kB)
adaface/__pycache__/util.cpython-312.pyc
ADDED
Binary file (14 kB)
adaface/__pycache__/util.cpython-38.pyc
ADDED
Binary file (8.57 kB)
adaface/adaface-infer.py
ADDED
@@ -0,0 +1,131 @@
(131 added lines, identical to adaface-infer.py at the repository root, shown above.)
adaface/adaface-translate.py
ADDED
@@ -0,0 +1,208 @@
(208 added lines, identical to adaface-translate.py at the repository root, shown above.)
adaface/adaface_wrapper.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import CLIPTextModel
|
4 |
+
from diffusers import (
|
5 |
+
StableDiffusionPipeline,
|
6 |
+
StableDiffusionImg2ImgPipeline,
|
7 |
+
UNet2DConditionModel,
|
8 |
+
DDIMScheduler,
|
9 |
+
AutoencoderKL,
|
10 |
+
)
|
11 |
+
from insightface.app import FaceAnalysis
|
12 |
+
from adaface.arc2face_models import CLIPTextModelWrapper
|
13 |
+
from adaface.util import get_arc2face_id_prompt_embs
|
14 |
+
import re, os
|
15 |
+
import sys
|
16 |
+
sys.modules['ldm'] = sys.modules['adaface']
|
17 |
+
|
18 |
+
class AdaFaceWrapper(nn.Module):
|
19 |
+
def __init__(self, pipeline_name, base_model_path, adaface_ckpt_path, device,
|
20 |
+
subject_string='z', num_vectors=16,
|
21 |
+
num_inference_steps=50, negative_prompt=None,
|
22 |
+
use_840k_vae=False, use_ds_text_encoder=False, is_training=False):
|
23 |
+
'''
|
24 |
+
pipeline_name: "text2img" or "img2img" or None. If None, the unet and vae are
|
25 |
+
removed from the pipeline to release RAM.
|
26 |
+
'''
|
27 |
+
super().__init__()
|
28 |
+
self.pipeline_name = pipeline_name
|
29 |
+
self.base_model_path = base_model_path
|
30 |
+
self.adaface_ckpt_path = adaface_ckpt_path
|
31 |
+
self.use_840k_vae = use_840k_vae
|
32 |
+
self.use_ds_text_encoder = use_ds_text_encoder
|
33 |
+
self.subject_string = subject_string
|
34 |
+
self.num_vectors = num_vectors
|
35 |
+
self.num_inference_steps = num_inference_steps
|
36 |
+
self.device = device
|
37 |
+
self.is_training = is_training
|
38 |
+
self.initialize_pipeline()
|
39 |
+
self.extend_tokenizer_and_text_encoder()
|
40 |
+
if negative_prompt is None:
|
41 |
+
self.negative_prompt = \
|
42 |
+
"flaws in the eyes, flaws in the face, lowres, non-HDRi, low quality, worst quality, artifacts, noise, text, watermark, glitch, " \
|
43 |
+
"mutated, ugly, disfigured, hands, partially rendered objects, partially rendered eyes, deformed eyeballs, cross-eyed, blurry, " \
|
44 |
+
"mutation, duplicate, out of frame, cropped, mutilated, bad anatomy, deformed, bad proportions, " \
|
45 |
+
"nude, naked, nsfw, topless, bare breasts"
|
46 |
+
else:
|
47 |
+
self.negative_prompt = negative_prompt
|
48 |
+
|
49 |
+
def load_subj_basis_generator(self, adaface_ckpt_path):
|
50 |
+
ckpt = torch.load(adaface_ckpt_path, map_location='cpu')
|
51 |
+
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
52 |
+
if self.subject_string not in string_to_subj_basis_generator_dict:
|
53 |
+
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
54 |
+
breakpoint()
|
55 |
+
|
56 |
+
self.subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
57 |
+
# In the original ckpt, num_out_layers is 16 for layerwise embeddings.
|
58 |
+
# But we don't do layerwise embeddings here, so we set it to 1.
|
59 |
+
self.subj_basis_generator.num_out_layers = 1
|
60 |
+
print(f"Loaded subject basis generator for '{self.subject_string}'.")
|
61 |
+
print(repr(self.subj_basis_generator))
|
62 |
+
self.subj_basis_generator.to(self.device)
|
63 |
+
if self.is_training:
|
64 |
+
self.subj_basis_generator.train()
|
65 |
+
else:
|
66 |
+
self.subj_basis_generator.eval()
|
67 |
+
|
68 |
+
def initialize_pipeline(self):
|
69 |
+
self.load_subj_basis_generator(self.adaface_ckpt_path)
|
70 |
+
# arc2face_text_encoder maps the face analysis embedding to 16 face embeddings
|
71 |
+
# in the UNet image space.
|
72 |
+
arc2face_text_encoder = CLIPTextModelWrapper.from_pretrained(
|
73 |
+
'models/arc2face', subfolder="encoder", torch_dtype=torch.float16
|
74 |
+
)
|
75 |
+
self.arc2face_text_encoder = arc2face_text_encoder.to(self.device)
|
76 |
+
|
77 |
+
if self.use_840k_vae:
|
78 |
+
# The 840000-step vae model is slightly better in face details than the original vae model.
|
79 |
+
# https://huggingface.co/stabilityai/sd-vae-ft-mse-original
|
80 |
+
vae = AutoencoderKL.from_single_file("models/diffusers/sd-vae-ft-mse-original/vae-ft-mse-840000-ema-pruned.ckpt", torch_dtype=torch.float16)
|
81 |
+
else:
|
82 |
+
vae = None
|
83 |
+
|
84 |
+
if self.use_ds_text_encoder:
|
85 |
+
# The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
|
86 |
+
# https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
|
87 |
+
text_encoder = CLIPTextModel.from_pretrained("models/ds_text_encoder", torch_dtype=torch.float16)
|
88 |
+
else:
|
89 |
+
text_encoder = None
|
90 |
+
|
91 |
+
remove_unet = False
|
92 |
+
|
93 |
+
if self.pipeline_name == "img2img":
|
94 |
+
PipelineClass = StableDiffusionImg2ImgPipeline
|
95 |
+
elif self.pipeline_name == "text2img":
|
96 |
+
PipelineClass = StableDiffusionPipeline
|
97 |
+
# pipeline_name is None means only use this instance to generate adaface embeddings, not to generate images.
|
98 |
+
elif self.pipeline_name is None:
|
99 |
+
PipelineClass = StableDiffusionPipeline
|
100 |
+
remove_unet = True
|
101 |
+
else:
|
102 |
+
raise ValueError(f"Unknown pipeline name: {self.pipeline_name}")
|
103 |
+
|
104 |
+
if os.path.isfile(self.base_model_path):
|
105 |
+
pipeline = PipelineClass.from_single_file(
|
106 |
+
self.base_model_path,
|
107 |
+
torch_dtype=torch.float16
|
108 |
+
)
|
109 |
+
else:
|
110 |
+
pipeline = PipelineClass.from_pretrained(
|
111 |
+
self.base_model_path,
|
112 |
+
torch_dtype=torch.float16,
|
113 |
+
safety_checker=None
|
114 |
+
)
|
115 |
+
print(f"Loaded pipeline from {self.base_model_path}.")
|
116 |
+
|
117 |
+
if self.use_840k_vae:
|
118 |
+
pipeline.vae = vae
|
119 |
+
print("Replaced the VAE with the 840k-step VAE.")
|
120 |
+
|
121 |
+
if self.use_ds_text_encoder:
|
122 |
+
pipeline.text_encoder = text_encoder
|
123 |
+
print("Replaced the text encoder with the DreamShaper text encoder.")
|
124 |
+
|
125 |
+
if remove_unet:
|
126 |
+
# Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
|
127 |
+
pipeline.unet = None
|
128 |
+
pipeline.vae = None
|
129 |
+
print("Removed UNet and VAE from the pipeline.")
|
130 |
+
|
131 |
+
noise_scheduler = DDIMScheduler(
|
132 |
+
num_train_timesteps=1000,
|
133 |
+
beta_start=0.00085,
|
134 |
+
beta_end=0.012,
|
135 |
+
beta_schedule="scaled_linear",
|
136 |
+
clip_sample=False,
|
137 |
+
set_alpha_to_one=False,
|
138 |
+
steps_offset=1,
|
139 |
+
)
|
140 |
+
|
141 |
+
pipeline.scheduler = noise_scheduler
|
142 |
+
self.pipeline = pipeline.to(self.device)
|
143 |
+
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
144 |
+
# Note there's a second "model" in the path.
|
145 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
146 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
147 |
+
# Patch the missing tokenizer in the subj_basis_generator.
|
148 |
+
if not hasattr(self.subj_basis_generator, 'clip_tokenizer'):
|
149 |
+
self.subj_basis_generator.clip_tokenizer = self.pipeline.tokenizer
|
150 |
+
print("Patched the missing tokenizer in the subj_basis_generator.")
|
151 |
+
|
152 |
+
def extend_tokenizer_and_text_encoder(self):
|
153 |
+
if self.num_vectors < 1:
|
154 |
+
raise ValueError(f"num_vectors has to be larger or equal to 1, but is {self.num_vectors}")
|
155 |
+
|
156 |
+
tokenizer = self.pipeline.tokenizer
|
157 |
+
# Add z0, z1, z2, ..., z15.
|
158 |
+
self.placeholder_tokens = []
|
159 |
+
for i in range(0, self.num_vectors):
|
160 |
+
self.placeholder_tokens.append(f"{self.subject_string}_{i}")
|
161 |
+
|
162 |
+
self.placeholder_tokens_str = " ".join(self.placeholder_tokens)
|
163 |
+
|
164 |
+
# Add the new tokens to the tokenizer.
|
165 |
+
num_added_tokens = tokenizer.add_tokens(self.placeholder_tokens)
|
166 |
+
if num_added_tokens != self.num_vectors:
|
167 |
+
raise ValueError(
|
168 |
+
f"The tokenizer already contains the token {self.subject_string}. Please pass a different"
|
169 |
+
" `subject_string` that is not already in the tokenizer.")
|
170 |
+
|
171 |
+
print(f"Added {num_added_tokens} tokens ({self.placeholder_tokens_str}) to the tokenizer.")
|
172 |
+
|
173 |
+
# placeholder_token_ids: [49408, ..., 49423].
|
174 |
+
self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.placeholder_tokens)
|
175 |
+
# print(self.placeholder_token_ids)
|
176 |
+
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
177 |
+
old_weight = self.pipeline.text_encoder.get_input_embeddings().weight
|
178 |
+
self.pipeline.text_encoder.resize_token_embeddings(len(tokenizer))
|
179 |
+
new_weight = self.pipeline.text_encoder.get_input_embeddings().weight
|
180 |
+
print(f"Resized text encoder token embeddings from {old_weight.shape} to {new_weight.shape} on {new_weight.device}.")
|
181 |
+
|
    # Extend pipeline.text_encoder with the adaface subject embeddings.
    # subj_embs: [16, 768].
    def update_text_encoder_subj_embs(self, subj_embs):
        # Initialise the newly added placeholder tokens with the adaface subject embeddings.
        token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
        with torch.no_grad():
            for i, token_id in enumerate(self.placeholder_token_ids):
                token_embeds[token_id] = subj_embs[i]
        print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.placeholder_tokens_str}) in the text encoder.")

    def update_prompt(self, prompt):
        # If the placeholder tokens are already in the prompt, then return the prompt as is.
        if self.placeholder_tokens_str in prompt:
            return prompt

        # If the subject string 'z' is not in the prompt, then simply prepend the placeholder tokens to the prompt.
        if re.search(r'\b' + self.subject_string + r'\b', prompt) is None:
            print(f"Subject string '{self.subject_string}' not found in the prompt. Adding it.")
            comp_prompt = self.placeholder_tokens_str + " " + prompt
        else:
            # Replace the subject string 'z' with the placeholder tokens.
            comp_prompt = re.sub(r'\b' + self.subject_string + r'\b', self.placeholder_tokens_str, prompt)
        return comp_prompt

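To make the substitution concrete, here is a tiny standalone sketch (not part of the file; subject string "z" and 16 placeholders assumed, mirroring the defaults above) of what update_prompt() does to a user prompt:

import re

subject_string = "z"
placeholder_tokens_str = " ".join(f"z_{i}" for i in range(16))

prompt = "portrait of a z wearing a suit"
expanded = re.sub(r'\b' + subject_string + r'\b', placeholder_tokens_str, prompt)
# -> "portrait of a z_0 z_1 ... z_15 wearing a suit"
print(expanded)
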
    # image_paths: a list of image paths. image_folder: the parent folder name.
    def generate_adaface_embeddings(self, image_paths, image_folder=None,
                                    pre_face_embs=None, gen_rand_face=False,
                                    out_id_embs_scale=1., noise_level=0, update_text_encoder=True):
        # faceid_embeds is a batch of extracted face analysis embeddings (BS * 512 = id_batch_size * 512).
        # If extract_faceid_embeds is True, faceid_embeds is *the same* embedding repeated id_batch_size times.
        # Otherwise, faceid_embeds is a batch of random embeddings, each instance being different.
        # The same applies to id_prompt_emb.
        # faceid_embeds is in the face analysis embedding space. id_prompt_emb is in the image prompt space.
        # Here id_batch_size = 1, so
        # faceid_embeds: [1, 512]. NOT used later.
        # id_prompt_emb: [1, 16, 768].
        # NOTE: Since return_core_id_embs is True, id_prompt_emb is only the 16 core ID embeddings.
        # arc2face prompt template: "photo of a id person".
        # ID embeddings start from "id person ...". So there are 3 template tokens before the 16 ID embeddings.
        face_image_count, faceid_embeds, id_prompt_emb \
            = get_arc2face_id_prompt_embs(self.face_app, self.pipeline.tokenizer, self.arc2face_text_encoder,
                                          extract_faceid_embeds=not gen_rand_face,
                                          pre_face_embs=pre_face_embs,
                                          # image_folder is passed only for logging purposes.
                                          # image_paths contains the paths of the images.
                                          image_folder=image_folder, image_paths=image_paths,
                                          images_np=None,
                                          id_batch_size=1,
                                          device=self.device,
                                          # input_max_length == 22: only keep the first 22 tokens,
                                          # including 3 template tokens, 16 ID tokens, and the BOS and EOS tokens.
                                          # The results are indistinguishable from input_max_length=77.
                                          input_max_length=22,
                                          noise_level=noise_level,
                                          return_core_id_embs=True,
                                          gen_neg_prompt=False,
                                          verbose=True)

        if face_image_count == 0:
            return None

        # adaface_subj_embs: [1, 1, 16, 768].
        # adaface_prompt_embs: [1, 77, 768] (not used).
        adaface_subj_embs, adaface_prompt_embs = \
            self.subj_basis_generator(id_prompt_emb, None, None,
                                      out_id_embs_scale=out_id_embs_scale,
                                      is_face=True, is_training=False,
                                      adaface_prompt_embs_inf_type='full_half_pad')
        # adaface_subj_embs: [16, 768]
        adaface_subj_embs = adaface_subj_embs.squeeze()
        if update_text_encoder:
            self.update_text_encoder_subj_embs(adaface_subj_embs)
        return adaface_subj_embs

    def encode_prompt(self, prompt, negative_prompt=None, device="cuda", verbose=False):
        if negative_prompt is None:
            negative_prompt = self.negative_prompt

        prompt = self.update_prompt(prompt)
        if verbose:
            print(f"Prompt: {prompt}")

        # For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
        # So we manually move it to GPU here.
        self.pipeline.text_encoder.to(device)
        # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
        prompt_embeds_, negative_prompt_embeds_ = \
            self.pipeline.encode_prompt(prompt, device=device, num_images_per_prompt=1,
                                        do_classifier_free_guidance=True, negative_prompt=negative_prompt)
        return prompt_embeds_, negative_prompt_embeds_

    # ref_img_strength is used only in the img2img pipeline.
    def forward(self, noise, prompt, negative_prompt=None, guidance_scale=4.0,
                out_image_count=4, ref_img_strength=0.8, generator=None, verbose=False):
        if negative_prompt is None:
            negative_prompt = self.negative_prompt
        # prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
        prompt_embeds_, negative_prompt_embeds_ = self.encode_prompt(prompt, negative_prompt, device=self.device, verbose=verbose)
        # Repeat the prompt embeddings for all images in the batch.
        prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
        negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)
        noise = noise.to(self.device).to(torch.float16)

        # noise: [BS, 4, 64, 64]
        # When the pipeline is text2img, strength is ignored.
        images = self.pipeline(image=noise,
                               prompt_embeds=prompt_embeds_,
                               negative_prompt_embeds=negative_prompt_embeds_,
                               num_inference_steps=self.num_inference_steps,
                               guidance_scale=guidance_scale,
                               num_images_per_prompt=1,
                               strength=ref_img_strength,
                               generator=generator).images
        # images: [BS, 3, 512, 512]
        return images

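For orientation, a rough end-to-end usage sketch of this wrapper follows. It is a hedged illustration only: the class name is inferred from the file name, the constructor arguments and image path are hypothetical (see adaface-infer.py for the actual entry point), and only generate_adaface_embeddings() and forward() are taken from the code above.

import torch
from adaface.adaface_wrapper import AdaFaceWrapper   # class name assumed from the file name

# Hypothetical constructor call; consult adaface-infer.py for the real arguments.
adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path=None,
                         adaface_ckpt_path="models/adaface/...", device="cuda")

# Extract the subject's ID embeddings from reference photos and patch the text encoder.
subj_embs = adaface.generate_adaface_embeddings(image_paths=["examples/alice/1.jpg"],
                                                update_text_encoder=True)

# "z" in the prompt is expanded to the 16 placeholder tokens by update_prompt().
noise = torch.randn(4, 4, 64, 64)
images = adaface.forward(noise, "portrait of a z hiking in the mountains", out_image_count=4)
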
adaface/arc2face_models.py
ADDED
@@ -0,0 +1,303 @@
import torch
import torch.nn as nn
from transformers import CLIPTextModel
from transformers.models.clip.modeling_clip import CLIPAttention
from typing import Any, Callable, Dict, Optional, Tuple, Union, List
from transformers.modeling_outputs import BaseModelOutputWithPooling
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
# from transformers.models.clip.modeling_clip import _make_causal_mask, _expand_mask
_make_causal_mask = AttentionMaskConverter._make_causal_mask
_expand_mask = AttentionMaskConverter._expand_mask

from adaface.util import add_noise_to_tensor

# Extend CLIPAttention by using multiple k_proj and v_proj in each head.
# To avoid too much increase of computation, we don't extend q_proj.
class CLIPAttentionMKV(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, multiplier=2):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.multiplier = multiplier

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim * self.multiplier)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim * self.multiplier)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # The (approximately) repeated token features are repeated along the last dim in tensor
    # (multiplier * num_heads * head_dim), and then reshaped to (bsz, -1, num_heads, head_dim).
    # Therefore, the "multiplier" dim is tucked into the seq_len dim, which looks like
    # [token1_emb, token1_emb, token2_emb, token2_emb, ..., tokenN_emb, tokenN_emb].
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def extend_weights(self, clip_attn_layer, layer_idx, multiplier, noise_std=0.1,
                       noise_std_is_relative=True, keep_norm=False, verbose=False):
        self.multiplier *= multiplier
        # q_proj and out_proj are the same as the original CLIPAttention.
        self.q_proj.weight.data = clip_attn_layer.q_proj.weight.data.clone()
        self.q_proj.bias.data = clip_attn_layer.q_proj.bias.data.clone()
        self.out_proj.weight.data = clip_attn_layer.out_proj.weight.data.clone()
        self.out_proj.bias.data = clip_attn_layer.out_proj.bias.data.clone()

        # The bias doesn't need noise perturbation: after the weights are noised,
        # different copies of the weight/bias will receive different gradients,
        # making the bias terms diverge and become identifiable after training.
        self.v_proj.bias.data = clip_attn_layer.v_proj.bias.data.repeat(multiplier)
        self.k_proj.bias.data = clip_attn_layer.k_proj.bias.data.repeat(multiplier)

        self.v_proj.weight.data = clip_attn_layer.v_proj.weight.data.repeat(multiplier, 1)
        self.k_proj.weight.data = clip_attn_layer.k_proj.weight.data.repeat(multiplier, 1)

        if noise_std > 0:
            ORIG_V_SHAPE = list(clip_attn_layer.v_proj.weight.shape)
            ORIG_V_SHAPE_D0 = ORIG_V_SHAPE[0]
            # Add noise to the extra copies of the weights (keep the first copy unchanged).
            self.v_proj.weight.data[ORIG_V_SHAPE_D0:] = \
                add_noise_to_tensor(self.v_proj.weight.data[ORIG_V_SHAPE_D0:],
                                    noise_std, noise_std_is_relative, keep_norm)
            if verbose:
                NEW_V_SHAPE = list(self.v_proj.weight.shape)
                NOISED_V_SHAPE = list(self.v_proj.weight.data[ORIG_V_SHAPE_D0:].shape)
                print(f"Layer {layer_idx}: {NOISED_V_SHAPE} in {NEW_V_SHAPE} of v_proj is added with {noise_std} noise")

            ORIG_K_SHAPE = list(clip_attn_layer.k_proj.weight.shape)
            ORIG_K_SHAPE_D0 = ORIG_K_SHAPE[0]
            # Add noise to the extra copies of the weights.
            self.k_proj.weight.data[ORIG_K_SHAPE_D0:] = \
                add_noise_to_tensor(self.k_proj.weight.data[ORIG_K_SHAPE_D0:],
                                    noise_std, noise_std_is_relative, keep_norm)
            if verbose:
                NEW_K_SHAPE = list(self.k_proj.weight.shape)
                NOISED_K_SHAPE = list(self.k_proj.weight.data[ORIG_K_SHAPE_D0:].shape)
                print(f"Layer {layer_idx}: {NOISED_K_SHAPE} in {NEW_K_SHAPE} of k_proj is added with {noise_std} noise")

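The interleaved layout described in the comment above _shape() can be checked numerically. Below is a small sketch (not part of the file; toy dimensions assumed) showing that repeating the k_proj weight rows and reshaping with a free sequence length makes every token occupy `multiplier` consecutive positions, i.e. [token0, token0, token1, token1, ...]:

import torch

# Toy dimensions; not the real CLIP sizes.
bsz, seq_len, num_heads, head_dim, multiplier = 1, 3, 2, 4, 2
embed_dim = num_heads * head_dim

x = torch.randn(bsz, seq_len, embed_dim)
w = torch.randn(embed_dim, embed_dim)                 # original k_proj weight
w_ext = w.repeat(multiplier, 1)                       # extended weight, as in extend_weights()

k_ext = (x @ w_ext.t()).view(bsz, -1, num_heads, head_dim).transpose(1, 2)   # [bsz, heads, multiplier*seq_len, head_dim]
k_orig = (x @ w.t()).view(bsz, -1, num_heads, head_dim).transpose(1, 2)      # [bsz, heads, seq_len, head_dim]

# Positions 2*t and 2*t+1 of the extended sequence both equal position t of the original.
assert torch.allclose(k_ext[:, :, 0::2], k_orig, atol=1e-5)
assert torch.allclose(k_ext[:, :, 1::2], k_orig, atol=1e-5)
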
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        query_states = self.q_proj(hidden_states) * self.scale
        # For key_states and value_states, the multiplier is absorbed into the seq_len (dim 1, shape specified as -1).
        # [token0_head_emb, token0_head_emb, token1_head_emb, token1_head_emb, ..., tokenN-1_head_emb, tokenN-1_head_emb].
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        # src_len0 is the original src_len without the multiplier.
        src_len0 = src_len // self.multiplier
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # Apply the causal_attention_mask first.
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len0):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len0)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            # The last dim of attn_weights corresponds to [token0, token0, token1, token1, ..., tokenN-1, tokenN-1].
            # If reshaping it as (self.multiplier, src_len0), it will become
            # [[token0, token0, token1, token1, ..., tokenN//2], [tokenN//2+1, tokenN//2+1, ..., tokenN-1, tokenN-1]],
            # and the mask will be applied to wrong elements.
            # If reshaping it as (src_len0, self.multiplier), it will become
            # [[token0, token1, ..., tokenN-1], [token0, token1, ..., tokenN-1]], and then
            # the mask at element i will mask all the multiplier elements at i, which is desired.
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len0, self.multiplier) + causal_attention_mask.unsqueeze(4)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len0):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len0)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len0, self.multiplier) + attention_mask.unsqueeze(4)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # This operation is a bit awkward, but it's required in order to
            # make sure that attn_weights keeps its gradient.
            # To do so, attn_weights has to be reshaped
            # twice and reused in the following code.
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped

class CLIPTextModelWrapper(CLIPTextModel):
    # Adapted from https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/clip/modeling_clip.py#L812
    # Modified to accept precomputed token embeddings "input_token_embs" as input, or to calculate them from input_ids and return them.
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        input_token_embs: Optional[torch.Tensor] = None,
        hidden_state_layer_weights: Optional[torch.Tensor] = None,
        return_token_embs: Optional[bool] = False,
    ) -> Union[Tuple, torch.Tensor, BaseModelOutputWithPooling]:

        if return_token_embs:
            return self.text_model.embeddings.token_embedding(input_ids)

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output_attentions = output_attentions if output_attentions is not None else self.text_model.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.text_model.config.output_hidden_states
        )
        if hidden_state_layer_weights is not None:
            output_hidden_states = True
        return_dict = return_dict if return_dict is not None else self.text_model.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.text_model.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=input_token_embs)

        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.text_model.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            # output_hidden_states is False by default, and only True if hidden_state_layer_weights is provided.
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # If output_hidden_states is True, then encoder_outputs[0] is last_hidden_state [1, 22, 768].
        # encoder_outputs[1] is hidden_states, which is a tuple of 13 hidden states, each being [1, 22, 768].
        # encoder_outputs[0] == encoder_outputs[1][12].
        if hidden_state_layer_weights is None:
            last_hidden_state = encoder_outputs[0]
        else:
            num_hidden_state_layers = len(hidden_state_layer_weights)
            last_hidden_states = encoder_outputs[1][-num_hidden_state_layers:]
            hidden_state_layer_weights = hidden_state_layer_weights.to(last_hidden_states[0].dtype)
            # Normalize the weights to sum to 1 across layers.
            # hidden_state_layer_weights: [3, 1] or [3, 768].
            hidden_state_layer_weights = hidden_state_layer_weights / hidden_state_layer_weights.sum(dim=0, keepdim=True)
            # [3, 1/768] -> [3, 1, 1, 1/768]
            hidden_state_layer_weights = hidden_state_layer_weights.unsqueeze(1).unsqueeze(1)
            # A weighted sum of last_hidden_states.
            # [3, 1, 22, 768] * [3, 1, 1, 1/768] -> [3, 1, 22, 768] -> [1, 22, 768]
            last_hidden_state = (torch.stack(last_hidden_states, dim=0) * hidden_state_layer_weights).sum(dim=0)

        last_hidden_state = self.text_model.final_layer_norm(last_hidden_state)

        # self.text_model.eos_token_id == 2 is True.
        if self.text_model.eos_token_id == 2:
            # The `eos_token_id` was incorrect before PR #24773: let's keep what has been done here.
            # A CLIP model with such an `eos_token_id` in the config can't work correctly with extra new tokens added.
            # ------------------------------------------------------------
            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
            # take features from the eot embedding (eot_token is the highest number in each sequence)
            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # The config gets the updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible).
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.text_model.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

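For illustration, here is a hedged sketch (not part of the file) of calling this wrapper with the per-layer weighting used by the subject basis generator later in this commit. The prompt "photo of a id person", max length 22, and the [1, 2, 4] layer weights mirror values that appear elsewhere in this repo; treat the exact call as an assumption:

import torch
from transformers import CLIPTokenizer
from adaface.arc2face_models import CLIPTextModelWrapper

text_encoder = CLIPTextModelWrapper.from_pretrained("openai/clip-vit-large-patch14")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

ids = tokenizer("photo of a id person", return_tensors="pt", padding="max_length",
                max_length=22, truncation=True).input_ids
# Weight the last 3 hidden layers; the weights are normalized inside forward().
layer_weights = torch.tensor([[1.0], [2.0], [4.0]])
out = text_encoder(input_ids=ids, hidden_state_layer_weights=layer_weights, return_dict=True)
print(out.last_hidden_state.shape)   # torch.Size([1, 22, 768])
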
    # Applied to layers [begin_layer_idx, end_layer_idx) in the encoder.
    # The layer indexed by end_layer_idx is not included.
    # If both layer indices are -1, then apply to all layers (0-11).
    def extend_clip_attention_MKV_multiplier(self, begin_layer_idx=-1, end_layer_idx=-1, multiplier=2, noise_std=0.1):
        num_extended_layers = 0

        for layer_idx, layer in enumerate(self.text_model.encoder.layers):
            if begin_layer_idx >= 0 and layer_idx < begin_layer_idx:
                continue
            if end_layer_idx >= 0 and layer_idx >= end_layer_idx:
                break
            # This shouldn't happen, unless self_attn has already been extended as CLIPAttentionMKV.
            if not isinstance(layer.self_attn, (CLIPAttention, CLIPAttentionMKV)):
                breakpoint()
            old_attn_layer = layer.self_attn
            if not isinstance(old_attn_layer, CLIPAttentionMKV):
                layer.self_attn = CLIPAttentionMKV(old_attn_layer.config, 1)
            layer.self_attn.extend_weights(old_attn_layer, layer_idx, multiplier, noise_std, verbose=True)
            num_extended_layers += 1

        return num_extended_layers

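Continuing the sketch above (same assumed text_encoder instance), doubling the K/V projections of every encoder layer would look roughly like this:

# Extend all 12 CLIP text encoder layers with 2x K/V projections, noised at std 0.1.
n = text_encoder.extend_clip_attention_MKV_multiplier(begin_layer_idx=-1, end_layer_idx=-1,
                                                      multiplier=2, noise_std=0.1)
print(f"Extended {n} attention layers.")
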
adaface/subj_basis_generator.py
ADDED
@@ -0,0 +1,758 @@
# Borrowed from ip-adapter resampler.py.
# https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/resampler.py
# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py

import math

import torch
from torch import nn
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange
from transformers import CLIPVisionModel, CLIPTokenizer

import numpy as np
from torch import einsum
from dataclasses import dataclass
from typing import Optional, Tuple
from transformers.utils import ModelOutput
from adaface.util import arc2face_inverse_face_prompt_embs, gen_gradient_scaler
from adaface.arc2face_models import CLIPTextModelWrapper
import sys
sys.modules['ldm'] = sys.modules['adaface']

def reshape_tensor(x, num_heads):
    bs, length, width = x.shape
    # (bs, length, width) --> (bs, length, n_heads, dim_per_head)
    x = x.view(bs, length, num_heads, -1)
    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
    x = x.transpose(1, 2)
    # (bs, n_heads, length, dim_per_head), reshaped to make the tensor contiguous.
    x = x.reshape(bs, num_heads, length, -1)
    return x

# FFN. Added a Dropout layer at the end, so that it can still load the old ckpt.
def FeedForward(dim, mult=4, p_dropout=0.1):
    inner_dim = int(dim * mult)
    return nn.Sequential(
        nn.LayerNorm(dim),
        nn.Linear(dim, inner_dim, bias=False),
        nn.GELU(),
        nn.Linear(inner_dim, dim, bias=False),
        nn.Dropout(p_dropout),
    )

# IP-Adapter FaceID class. Only used in knn-faces.py.
# From: https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter_faceid_separate.py
class IP_MLPProjModel(nn.Module):
    def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
        super().__init__()

        self.cross_attention_dim = cross_attention_dim
        self.num_tokens = num_tokens

        self.proj = nn.Sequential(
            nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
            nn.GELU(),
            nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
        )
        self.norm = nn.LayerNorm(cross_attention_dim)

    def forward(self, id_embeds):
        x = self.proj(id_embeds)
        x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
        x = self.norm(x)
        return x

# group_dim: the tensor dimension that corresponds to the multiple groups.
class LearnedSoftAggregate(nn.Module):
    def __init__(self, num_feat, group_dim, keepdim=False):
        super(LearnedSoftAggregate, self).__init__()
        self.group_dim = group_dim
        # num_feat = 1: element-wise score function & softmax.
        # num_feat > 1: the linear score function is applied to the last dim (features) of the input tensor.
        self.num_feat = num_feat
        self.feat2score = nn.Linear(num_feat, 1, bias=False)
        self.keepdim = keepdim

    def forward(self, x, score_basis=None):
        # If there's only one mode, do nothing.
        if x.shape[self.group_dim] == 1:
            if self.keepdim:
                return x
            else:
                return x.squeeze(self.group_dim)

        # Assume the last dim of x is the feature dim.
        if score_basis is None:
            score_basis = x

        if self.num_feat == 1:
            mode_scores = self.feat2score(score_basis.unsqueeze(-1)).squeeze(-1)
        else:
            mode_scores = self.feat2score(score_basis)
        attn_probs = mode_scores.softmax(dim=self.group_dim)
        x_aggr = (x * attn_probs).sum(dim=self.group_dim, keepdim=self.keepdim)
        return x_aggr

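A quick shape check of the aggregation above (not part of the file; toy tensors, sizes assumed), using the class just defined: with num_feat=768 and group_dim=1, a [BS, num_modes, N, 768] input collapses to [BS, N, 768] via a learned softmax over the mode dimension.

import torch

agg = LearnedSoftAggregate(num_feat=768, group_dim=1, keepdim=False)
x = torch.randn(2, 4, 16, 768)           # [BS, num_modes, N, D]
print(agg(x).shape)                       # torch.Size([2, 16, 768])
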
def LoRA_ExpandEmbs(input_dim, lora_rank, output_dim, num_modes,
                    num_output_vecs, elementwise_affine=True, p_dropout=0.1):
    return nn.Sequential(
        # Project to [BS, lora_rank * output_dim * num_modes].
        # It takes a huge param size. 512 * 32 * 768 * 4 = 6,291,456.
        nn.Linear(input_dim, lora_rank * output_dim * num_modes, bias=False),
        # Reshape to [BS, num_modes, lora_rank, output_dim].
        Rearrange('b (m q d) -> b m q d', q=lora_rank, m=num_modes, d=output_dim),
        nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
        # Aggregate [BS, num_modes, lora_rank, output_dim] -> [BS, lora_rank, output_dim].
        LearnedSoftAggregate(num_feat=output_dim, group_dim=1, keepdim=False) if num_modes > 1 \
            else Rearrange('b () q d -> b q d'),
        nn.Dropout(p_dropout),
        # Permute to [BS, output_dim, lora_rank].
        Rearrange('b q d -> b d q'),
        # Project to [BS, output_dim, num_output_vecs].
        nn.Linear(lora_rank, num_output_vecs, bias=False),
        # Permute to [BS, num_output_vecs, output_dim].
        Rearrange('b d q -> b q d'),
        nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
        nn.Dropout(p_dropout),
    )

def ExpandEmbs(input_dim, output_dim, expansion_ratio, elementwise_affine=True, p_dropout=0.1):
    return nn.Sequential(
        # Project to [BS, expansion_ratio * output_dim].
        nn.Linear(input_dim, expansion_ratio * output_dim, bias=False),
        # Reshape to [BS, expansion_ratio, output_dim].
        Rearrange('b (e d) -> b e d', e=expansion_ratio, d=output_dim),
        nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
        nn.Dropout(p_dropout),
    )

# Input: [BS, N, D].
def MultimodeProjection(input_dim, output_dim=-1, num_modes=4, elementwise_affine=True, p_dropout=0.1):
    if output_dim == -1:
        output_dim = input_dim

    return nn.Sequential(
        nn.Linear(input_dim, output_dim * num_modes, bias=False),
        # Reshape to [BS, N, num_modes, output_dim].
        Rearrange('b n (m d) -> b n m d', m=num_modes, d=output_dim),
        nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
        # If num_modes == 1, then simply remove the mode dim. Otherwise, aggregate the modes.
        LearnedSoftAggregate(num_feat=output_dim, group_dim=2, keepdim=False) if num_modes > 1 \
            else Rearrange('b n () d -> b n d'),
        nn.Dropout(p_dropout),
    )

# Low-rank to high-rank transformation.
def Lora2Hira(lora_rank, hira_rank, output_dim, num_modes, elementwise_affine=True, p_dropout=0.1):
    return nn.Sequential(
        # Permute to [BS, output_dim, lora_rank].
        Rearrange('b q d -> b d q'),
        # Project to [BS, output_dim, hira_rank * num_modes].
        nn.Linear(lora_rank, hira_rank * num_modes, bias=False),
        # Reshape and permute to [BS, num_modes, hira_rank, output_dim].
        Rearrange('b d (m q) -> b m q d', m=num_modes, q=hira_rank),
        nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
        # Aggregate [BS, num_modes, hira_rank, output_dim] -> [BS, hira_rank, output_dim].
        LearnedSoftAggregate(num_feat=output_dim, group_dim=1, keepdim=False) if num_modes > 1 \
            else Rearrange('b () q d -> b q d'),
        nn.Dropout(p_dropout),
    )

class PerceiverAttention(nn.Module):
    def __init__(self, *, dim, dim_head=64, num_heads=8, elementwise_affine=True):
        super().__init__()
        self.scale = dim_head**-0.5
        self.dim_head = dim_head
        self.num_heads = num_heads
        inner_dim = dim_head * num_heads

        self.norm1 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine)
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine)

        self.to_q = nn.Linear(dim, inner_dim, bias=False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
        self.to_out = nn.Linear(inner_dim, dim, bias=False)

    def forward(self, x, latent_queries):
        """
        Args:
            x (torch.Tensor): image features
                shape (b, n1, D)
            latent_queries (torch.Tensor): latent features
                shape (b, n2, D)
        """
        x = self.norm1(x)
        latent_queries = self.norm2(latent_queries)

        b, l, _ = latent_queries.shape

        q = self.to_q(latent_queries)
        kv_input = torch.cat((x, latent_queries), dim=-2)
        k, v = self.to_kv(kv_input).chunk(2, dim=-1)

        q = reshape_tensor(q, self.num_heads)
        k = reshape_tensor(k, self.num_heads)
        v = reshape_tensor(v, self.num_heads)

        # attention
        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
        weight = (q * scale) @ (k * scale).transpose(-2, -1)  # More stable with f16 than dividing afterwards
        attn = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = attn @ v

        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)

        return self.to_out(out)


class CrossAttention(nn.Module):
    # output_dim is always the same as input_dim.
    # num_q only matters when q_aware_to_v is True.
    # If q_aware_to_v is False, query x in forward() is still usable.
    def __init__(self, input_dim, num_heads=6, p_dropout=0.05,
                 identity_to_q=False, identity_to_k=False, identity_to_v=False, v_has_skip=True,
                 q_aware_to_v=True, num_q=416, v_repeat=4, q_aware_to_v_lora_rank=64,
                 identity_to_out=False, out_has_skip=False):
        super().__init__()
        dim_head = input_dim // num_heads
        inner_dim = dim_head * num_heads

        self.num_heads = num_heads
        self.q_aware_to_v = q_aware_to_v
        self.v_has_skip = v_has_skip
        self.to_q = nn.Sequential(
            nn.Linear(input_dim, inner_dim, bias=False),
            nn.LayerNorm(inner_dim, elementwise_affine=True)
        ) if not identity_to_q else nn.Identity()
        self.to_k = nn.Sequential(
            nn.Linear(input_dim, inner_dim, bias=False),
            nn.LayerNorm(inner_dim, elementwise_affine=True)
        ) if not identity_to_k else nn.Identity()

        self.v_repeat = v_repeat
        self.num_q_group = num_q_group = num_q // v_repeat  # 416 / 4 = 104.

        # If q_aware_to_v is True, then self.to_v consists of num_q projections of input_dim to inner_dim.
        # Otherwise, self.to_v consists of a single projection of input_dim to inner_dim.
        if q_aware_to_v:
            # all_q_mid: 104 * 64 = 6656.
            all_q_mid = num_q_group * q_aware_to_v_lora_rank
            self.to_v = nn.Sequential(
                # number of params: 768 * 6656 = 5,111,808.
                # Input: [BS, 16, 768]. Output: [BS, 16, 104*64] = [BS, 16, 6656].
                # Each 768-dim vec is dispersed into 104 64-dim vecs.
                nn.Linear(input_dim, all_q_mid, bias=False),
                nn.LayerNorm(all_q_mid, elementwise_affine=True),
                # Change the dim of the tensor to [BS, 6656, 16], as Conv1d transforms dim 1.
                Rearrange('b n q -> b q n', q=all_q_mid),
                # Each q_aware_to_v projection has its own linear layer.
                # The total number of parameters will be 6656*768 = 5,111,808.
                # Output: [BS, 104*768, 16]. Each 64 dim feature is expanded to 768 dim.
                nn.Conv1d(
                    in_channels=all_q_mid,
                    out_channels=num_q_group * input_dim,
                    kernel_size=1,
                    groups=num_q_group,
                    bias=False,
                ),
                # Output: [BS, 104, 16, 768].
                Rearrange('b (q d) n -> b q n d', q=num_q_group, d=input_dim),
                nn.LayerNorm(input_dim, elementwise_affine=True),
            )
        else:
            self.to_v = nn.Sequential(
                nn.Linear(input_dim, inner_dim, bias=False),
                nn.LayerNorm(inner_dim, elementwise_affine=True)
            ) if not identity_to_v else nn.Identity()

        if identity_to_out:
            assert not out_has_skip, "identity_to_out=True, then out_has_skip has to be False."

        if identity_to_out:
            self.to_out = nn.Identity()
        else:
            self.to_out = nn.Sequential(
                nn.Linear(input_dim, input_dim, bias=False),
                nn.Dropout(p_dropout),
                nn.LayerNorm(inner_dim, elementwise_affine=True)
            )

        self.out_has_skip = out_has_skip
        self.attn_drop = nn.Dropout(p_dropout)

    def forward(self, x, context=None, attn_mat=None, return_attn=False):
        h = self.num_heads

        if context is None:
            context = x

        if attn_mat is None:
            # q: [BS, Q, D] -> [BS, Q, D].
            q = self.to_q(x)
            # k: [BS, L, D] -> [BS, L, D].
            k = self.to_k(context)
            # q: [6, 512, 128], k: [6, 17, 128].
            q, k = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k))

        if self.q_aware_to_v:
            # context: [BS, L, D]. v: [BS, Q, L, D].
            # There are effectively Q to_v projections.
            v = self.to_v(context)
            if self.v_has_skip:
                v = v + context.unsqueeze(1)
        else:
            # v: [BS, L, D].
            v = self.to_v(context)
            if self.v_has_skip:
                v = v + context

        #print(v.shape)

        if self.q_aware_to_v:
            # v: [6, 64, 17, 128].
            # v is query-specific, so there's an extra dim for the query.
            v = rearrange(v, 'b q n (h d) -> (b h) q n d', h=h)
            # Each v is for a query group with 512/64 = 8 queries.
            # So each v is repeated 8 times to match the number of queries.
            # v: [6, 64, 17, 128] -> [6, 512, 17, 128].
            v = v.repeat(1, self.v_repeat, 1, 1)
        else:
            v = rearrange(v, 'b n (h d) -> (b h) n d', h=h)

        if attn_mat is None:
            scale = q.size(-1) ** -0.25
            sim = einsum('b i d, b j d -> b i j', q * scale, k * scale)
            # sim: [6, 64, 17]. 6: bs 1 * h 6.
            # attention, what we cannot get enough of
            # NOTE: the normalization is done across tokens, not across pixels.
            # So for each pixel, the sum of attention scores across tokens is 1.
            attn = sim.softmax(dim=-1)
            attn = self.attn_drop(attn)
            #print(attn.std())
        else:
            attn = attn_mat

        if self.q_aware_to_v:
            # attn: [6, 32, 17]. v: [6, 32, 17, 128]. 128: dim of each head. out: [6, 32, 128].
            # out is combined with different attn weights and v for different queries.
            out = einsum('b i j, b i j d -> b i d', attn, v)
        else:
            # v: [6, 17, 128]. out: [6, 32, 128].
            out = einsum('b i j, b j d -> b i d', attn, v)

        # [6, 32, 128] -> [1, 32, 768].
        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)

        if self.out_has_skip:
            out = self.to_out(out) + out
        else:
            out = self.to_out(out)

        if return_attn:
            return out, attn
        else:
            return out

class SubjBasisGenerator(nn.Module):
    def __init__(
        self,
        # number of cross-attention heads. Half of the number of heads (12) of OpenAI clip-vit-large-patch14:
        # https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
        num_heads=6,
        num_id_vecs={ 'subj': 77, 'bg': 257 },  # number of identity vectors. 18: 16 face tokens + 2 extra tokens. 257: 257 CLIP tokens.
        num_out_embs_per_layer=4,               # num_out_embs. subj: 16. bg: 4.
        num_out_layers=16,                      # number of layers of output embeddings.
        image_embedding_dim=768,                # CLIP image feature dimension, as per config.json above.
        # DINO vits16 has 6 attention heads:
        # https://huggingface.co/facebook/dino-vits16/blob/main/config.json
        dino_embedding_dim=384,                 # DINO object feature dimension for objects.
        output_dim=768,                         # CLIP text embedding input dimension.
        placeholder_is_bg: bool = False,            # Whether the placeholder is for the image background.
        prompt2token_proj_grad_scale: float = 0.4,  # Gradient scale for prompt2token_proj.
        zs_extra_words_scale: float = 0.5,          # Scale for extra words in the prompt2token_proj.
        learnable_hidden_state_weights_scheme: str = 'per-layer',  # none, per-layer.
        bg_prompt_translator_has_to_out_proj: bool = False,  # Whether the prompt_trans_layers have a to_out projection.
    ):
        super().__init__()

        self.placeholder_is_bg = placeholder_is_bg
        self.num_out_layers = num_out_layers
        self.num_out_embs_per_layer = num_out_embs_per_layer
        # subj: 64, bg: 32.
        self.num_out_embs = num_out_layers * num_out_embs_per_layer
        self.output_dim = output_dim
        # num_id_vecs should be the number of core ID embs, 16.
        # However, in such case, pos_embs is not used. So it doesn't matter if it's wrongly set.
        self.num_id_vecs = num_id_vecs['bg'] if placeholder_is_bg else num_id_vecs['subj']
        self.pos_embs = nn.Parameter(torch.randn(1, self.num_id_vecs, output_dim))
        self.pos_embs_ln = nn.LayerNorm(output_dim)
        self.zs_extra_words_scale = zs_extra_words_scale
        self.output_scale = output_dim ** -0.5
        self.clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

        if not self.placeholder_is_bg:
            # [1, 384] -> [1, 16, 768].
            # TODO: use CLIPTextModelWrapper as obj_proj_in.
            self.obj_proj_in = ExpandEmbs(dino_embedding_dim, output_dim, expansion_ratio=self.num_id_vecs)

            # self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings).
            # If self.placeholder_is_bg: prompt2token_proj is set to None.
            self.prompt2token_proj = CLIPTextModelWrapper.from_pretrained('openai/clip-vit-large-patch14')
            self.prompt2token_proj_grad_scale = prompt2token_proj_grad_scale
            self.prompt2token_proj_grad_scaler = gen_gradient_scaler(prompt2token_proj_grad_scale)
            print(f"Subj prompt2token_proj initialized with grad scale of {prompt2token_proj_grad_scale}.")
            # Freeze prompt2token_proj if prompt2token_proj_grad_scale is 0.
            # Set requires_grad to False for all parameters in prompt2token_proj, to save memory taken by the optimizer.
            if prompt2token_proj_grad_scale == 0:
                self.freeze_prompt2token_proj()

            self.prompt2token_proj_attention_multiplier = -1
            self.initialize_hidden_state_layer_weights(learnable_hidden_state_weights_scheme, 'cpu')
            self.pad_embeddings = None
            self.bg_proj_in = None
        else:
            # For background placeholders, face and object embeddings are not used as they are foreground.
            self.obj_proj_in = None
            self.prompt2token_proj = None
            print("Bg prompt2token_proj is set to None.")

            self.bg_proj_in = nn.Sequential(
                nn.Linear(image_embedding_dim, output_dim, bias=False),
                nn.LayerNorm(output_dim),
            )

            self.latent_queries = nn.Parameter(torch.randn(1, self.num_out_embs, output_dim))
            self.latent_queries_ln = nn.LayerNorm(output_dim)

            self.bg_prompt_translator_has_to_out_proj = bg_prompt_translator_has_to_out_proj
            identity_to_v = False
            v_has_skip = not identity_to_v            # True
            identity_to_out = not bg_prompt_translator_has_to_out_proj  # True
            out_has_skip = not identity_to_out        # False
            # prompt_translator has a to_v projection with skip connection, and doesn't have a to_out projection.
            # dim=768, num_heads=6.
            self.prompt_translator = \
                CrossAttention(input_dim=output_dim, num_heads=num_heads, p_dropout=0.05,
                               identity_to_q=False, identity_to_k=False, identity_to_v=identity_to_v,
                               q_aware_to_v=False, v_has_skip=v_has_skip,
                               num_q=0,  # When not q_aware_to_v, num_q is not referenced.
                               identity_to_out=identity_to_out,
                               out_has_skip=out_has_skip)
            '''
            prompt_translator: CLIPEncoder
            # https://github.com/huggingface/transformers/blob/1872bde7fc6a5d6796bd742bc2dc38eaf8069c5d/src/transformers/models/clip/modeling_clip.py#L566
            # CLIPEncoder.layers: 12 layers of CLIPEncoderLayer, each being
                (0): CLIPEncoderLayer(
                    (self_attn): CLIPAttention(
                        (k_proj): Linear(in_features=768, out_features=768, bias=True)
                        (v_proj): Linear(in_features=768, out_features=768, bias=True)
                        (q_proj): Linear(in_features=768, out_features=768, bias=True)
                        (out_proj): Linear(in_features=768, out_features=768, bias=True)
                    )
                    (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                    (mlp): CLIPMLP(
                        (activation_fn): QuickGELUActivation()
                        (fc1): Linear(in_features=768, out_features=3072, bias=True)
                        (fc2): Linear(in_features=3072, out_features=768, bias=True)
                    )
                    (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                )
            '''

        print(repr(self))

    # raw_id_embs: ArcFace embeddings for faces (not used since we have arc2face_id_embs),
    # or DINO embeddings for objects.
    # arc2face_id_embs: [BS, 16, 768], the core identity embeddings generated by Arc2Face.
    def forward(self, arc2face_id_embs, clip_features=None, raw_id_embs=None, out_id_embs_scale=1.0,
                is_face=True, is_training=False, adaface_prompt_embs_inf_type='full_half_pad'):

        if not self.placeholder_is_bg:
            BS = arc2face_id_embs.shape[0]
        else:
            # If bg, then arc2face_id_embs is set to None, but clip_features is not None.
            BS = clip_features.shape[0]

        adaface_prompt_embs = None
        if not hasattr(self, 'clip_tokenizer'):
            self.clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

        # No need to use raw_id_embs if placeholder_is_bg.
        if not self.placeholder_is_bg:
            if is_face:
                assert arc2face_id_embs is not None
                # arc2face_embs has been projected to the (modified) prompt embedding space
                # by arc2face_forward_face_embs. This prompt embedding space is modified because Arc2Face finetuned
                # the text encoder and the U-Net.
                # in embedding_manager: [BS, 16, 768] -> [BS, 77, 768].
                # arc2face_id_embs is part of arc2face_embs: [BS, 77, 768] -> [BS, 16, 768].
                # adaface_prompt_embs is projected to the prompt embedding spaces. This is the
                # original U-Net prompt embedding space.

                # hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
                hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
                # return_emb_types: a list of strings, each string is among
                # ['full', 'core', 'full_pad', 'full_half_pad', 'full_zeroed_extra', 'b_core_e'].
                # Using b_core_e is more computationally efficient than using full_zeroed_extra.
                # But there is an unknown BUG that causes a crash when using b_core_e.
                if is_training:
                    return_emb_types = ['full_pad', 'core']
                else:
                    # adaface_prompt_embs_inf_type: default is full_half_pad, same as training.
                    return_emb_types = [adaface_prompt_embs_inf_type, 'core']

                if self.pad_embeddings is None:
                    self.generate_pad_embeddings()
                else:
                    self.pad_embeddings = self.pad_embeddings.to(arc2face_id_embs.device)

                with torch.set_grad_enabled(self.training and self.prompt2token_proj_grad_scale != 0):
                    # If list_extra_words is not None, then core_id_embs: [BS, 18, 768], three leading words, the 16 identity tokens
                    # and (at most) two extra words in full_prompt_embs, without BOS and EOS.
                    # If list_extra_words is None, then core_id_embs: [BS, 16, 768], the 16 identity tokens in full_prompt_embs.
                    # hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
                    # zs_extra_words_scale is only effective when list_extra_words is not None.
                    # adaface_prompt_embs: [BS, 77, 768], core_id_embs: [BS, 16, 768].
                    adaface_prompt_embs, core_id_embs = \
                        arc2face_inverse_face_prompt_embs(self.clip_tokenizer,
                                                          self.prompt2token_proj,
                                                          arc2face_id_embs,
                                                          list_extra_words=None,
                                                          return_emb_types=return_emb_types,
                                                          pad_embeddings=self.pad_embeddings,
                                                          hidden_state_layer_weights=hidden_state_layer_weights,
                                                          input_max_length=77, zs_extra_words_scale=self.zs_extra_words_scale)
                # Reduce the update rate to prompt2token_proj.
                adaface_prompt_embs = self.prompt2token_proj_grad_scaler(adaface_prompt_embs)
                core_id_embs = self.prompt2token_proj_grad_scaler(core_id_embs)
            elif raw_id_embs is not None:
                # id_embs: [BS, 384] -> [BS, 18, 768].
                # obj_proj_in is expected to project the DINO object features to
                # the token embedding space. So no need to use prompt2token_proj.
                id_embs = self.obj_proj_in(raw_id_embs)
            else:
                breakpoint()
        else:
            # Otherwise, context is the ad-hoc CLIP image features.
            # id_embs: [BS, 257, 768].
            id_embs = self.bg_proj_in(clip_features)

        if self.placeholder_is_bg:
            id_embs = id_embs + self.pos_embs_ln(self.pos_embs)
            latent_queries = self.latent_queries_ln(self.latent_queries).repeat(BS, 1, 1)
            # If bg, we don't have to use a specific attn layer for each 4-vec set. Instead, one attn layer can generate 257 embs,
            # and we take the first 16*4=64.
            # Output of prompt_translator is exactly num_out_embs == 64 tokens. id_embs_out: [BS, 64, 768].
            # prompt_translator: better named as bg_prompt_translator. It maps the bg features
            # to bg prompt embeddings.
            with torch.set_grad_enabled(self.training):
                id_embs_out = self.prompt_translator(latent_queries, id_embs)
            # [BS, 64, 768] -> [BS, 16, 4, 768]
            id_embs_out = id_embs_out.reshape(BS, self.num_out_layers, -1, self.output_dim)
            adaface_subj_embs = id_embs_out * self.output_scale  # * 0.036
        else:
            # adaface_subj_embs: [BS, 16, 768] -> [BS, 1, 16, 768] -> [BS, 16, 16, 768]
            adaface_subj_embs = core_id_embs.unsqueeze(1).repeat(1, self.num_out_layers, 1, 1)

        # If out_id_embs_scale < 1, adaface_subj_embs is a mix of adaface_subj_embs and pad_embeddings.
        if out_id_embs_scale != 1:
            # pad_embeddings: [77, 768] -> [16, 768] -> [1, 1, 16, 768].
            pad_embeddings = self.pad_embeddings[4:4+self.num_out_embs_per_layer].unsqueeze(0).unsqueeze(0)
            adaface_subj_embs = adaface_subj_embs * out_id_embs_scale \
                                + pad_embeddings * (1 - out_id_embs_scale)

        return adaface_subj_embs, adaface_prompt_embs

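As a sanity-check sketch of the subject branch of this forward pass (not part of the file; dummy inputs, and a freshly initialized generator that downloads the assumed openai/clip-vit-large-patch14 checkpoint instead of the trained adaface weights used by the app):

import torch

subj_gen = SubjBasisGenerator(placeholder_is_bg=False)
subj_gen.eval()

# Dummy stand-in for the 16 core Arc2Face ID embeddings of one face.
arc2face_id_embs = torch.randn(1, 16, 768)
with torch.no_grad():
    adaface_subj_embs, adaface_prompt_embs = subj_gen(arc2face_id_embs, None, None,
                                                      out_id_embs_scale=1.0,
                                                      is_face=True, is_training=False)
print(adaface_subj_embs.shape)   # [BS, num_out_layers, 16, 768]
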
569 |
+
def initialize_hidden_state_layer_weights(self, learnable_hidden_state_weights_scheme, device):
|
570 |
+
if learnable_hidden_state_weights_scheme == 'none':
|
571 |
+
self.hidden_state_layer_weights = None
|
572 |
+
# A grad scaler with alpha =1 is nn.Identity(), which outputs None given None as input.
|
573 |
+
self.hidden_state_layer_weights_grad_scaler = gen_gradient_scaler(1)
|
574 |
+
print("hidden_state_layer_weights is set to None.")
|
575 |
+
|
576 |
+
elif learnable_hidden_state_weights_scheme == 'per-layer':
|
577 |
+
# Learnable weights of the last 3 layers, initialized to putting more focus on the last layer.
|
578 |
+
# 'per-layer': Different weights for different layers, but the same for different channels.
|
579 |
+
# hidden_state_layer_weights: [3, 1].
|
580 |
+
self.hidden_state_layer_weights = nn.Parameter(torch.tensor([[1.0], [2.0], [4.0]], device=device),
|
581 |
+
requires_grad=True)
|
582 |
+
self.hidden_state_layer_weights_grad_scaler = gen_gradient_scaler(5)
|
583 |
+
print("hidden_state_layer_weights initialized as per-layer [1, 2, 4], with grad scaler 5.")
|
584 |
+
else:
|
585 |
+
breakpoint()
|
586 |
+
|
587 |
+
def generate_pad_embeddings(self):
|
588 |
+
# clip_embeddings: CLIPTextEmbeddings instance. pad_embeddings is generated after
|
589 |
+
# prompt2token_proj is loaded from the finetuned weight. It seems such pad embeddings perform
|
590 |
+
# slightly better than the original pad embeddings.
|
591 |
+
clip_embeddings = self.prompt2token_proj.text_model.embeddings
|
592 |
+
# clip_embeddings() and clip_embeddings.token_embedding() differ in that
|
593 |
+
# clip_embeddings() adds positional embeddings, while clip_embeddings.token_embedding() doesn't.
|
594 |
+
# Adding positional embeddings seems to help somewhat.
|
595 |
+
# pad_tokens: pad_token_id 49407 repeated 77 times.
|
596 |
+
# pad_token_id is the EOS token. But BOS is 49406.
|
597 |
+
pad_tokens = torch.tensor([self.clip_tokenizer.pad_token_id]).to(clip_embeddings.token_embedding.weight.device).repeat(77)
|
598 |
+
# pad_embeddings: [77, 768].
|
599 |
+
pad_embeddings = clip_embeddings(pad_tokens)[0]
|
600 |
+
# We don't allow face recon to influence the pad embeddings.
|
601 |
+
# Otherwise, face identity will leak into the pad embeddings.
|
602 |
+
self.pad_embeddings = pad_embeddings.detach()
|
603 |
+
|
604 |
+
def extend_prompt2token_proj_attention(self, begin_layer_idx=-1, end_layer_idx=-1, multiplier=2, noise_std=0.1):
|
605 |
+
if multiplier > 1:
|
606 |
+
num_extended_layers = self.prompt2token_proj.extend_clip_attention_MKV_multiplier(begin_layer_idx, end_layer_idx, multiplier, noise_std)
|
607 |
+
self.prompt2token_proj_attention_multiplier = multiplier
|
608 |
+
print(f"{num_extended_layers} layers in prompt2token_proj_attention are x{multiplier}")
|
609 |
+
|
610 |
+
def freeze_prompt2token_proj(self):
|
611 |
+
# If bg, then prompt2token_proj is set to None. Therefore no need to freeze it.
|
612 |
+
# Then we don't have to check whether it's for subj or bg.
|
613 |
+
if self.prompt2token_proj is not None:
|
614 |
+
frozen_param_names = []
|
615 |
+
for param_name, param in self.prompt2token_proj.named_parameters():
|
616 |
+
if param.requires_grad:
|
617 |
+
param.requires_grad = False
|
618 |
+
frozen_param_names.append(param_name)
|
619 |
+
# If param is already frozen, then no need to freeze it again.
|
620 |
+
print(f"{len(frozen_param_names)} params in Subj prompt2token_proj is frozen.")
|
621 |
+
#print(f"Frozen parameters:\n{frozen_param_names}")
|
622 |
+
|
623 |
+
def __repr__(self):
|
624 |
+
type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
|
625 |
+
# Fix compatibility with the previous version.
|
626 |
+
if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
|
627 |
+
self.bg_prompt_translator_has_to_out_proj = False
|
628 |
+
if not hasattr(self, 'num_out_embs'):
|
629 |
+
self.num_out_embs = -1
|
630 |
+
return f"{type_sig} SubjBasisGenerator: num_out_embs={self.num_out_embs}, " \
|
631 |
+
f"bg_prompt_translator_has_to_out_proj={self.bg_prompt_translator_has_to_out_proj}"
|
632 |
+
|
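A minimal, hypothetical invocation sketch (not part of the diff): it mirrors the call in adaface_wrapper.py further below, assumes `subj_basis_generator` is a SubjBasisGenerator instance restored from an AdaFace checkpoint, and uses a random tensor as a stand-in for the 16 core Arc2Face ID embeddings.

import torch

# Assumption: subj_basis_generator is a SubjBasisGenerator loaded from a checkpoint.
id_prompt_emb = torch.randn(1, 16, 768)   # stand-in for the 16 core Arc2Face ID embeddings
subj_basis_generator.num_out_layers = 1   # non-layerwise output, as in AdaFaceWrapper below
adaface_subj_embs, adaface_prompt_embs = subj_basis_generator(
    id_prompt_emb, None, None,
    out_id_embs_scale=1.0, is_face=True, is_training=False,
    adaface_prompt_embs_inf_type='full_half_pad')
print(adaface_subj_embs.shape)            # expected [1, 1, 16, 768]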
633 |
+
@dataclass
|
634 |
+
class BaseModelOutputWithPooling2(ModelOutput):
|
635 |
+
"""
|
636 |
+
Base class for model's outputs that also contains a pooling of the last hidden states.
|
637 |
+
|
638 |
+
Args:
|
639 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
640 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
641 |
+
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
|
642 |
+
Last layer hidden-state of the first token of the sequence (classification token) after further processing
|
643 |
+
through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
|
644 |
+
the classification token after processing through a linear layer and a tanh activation function. The linear
|
645 |
+
layer weights are trained from the next sentence prediction (classification) objective during pretraining.
|
646 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
647 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
648 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
649 |
+
|
650 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
651 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
652 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
653 |
+
sequence_length)`.
|
654 |
+
|
655 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
656 |
+
heads.
|
657 |
+
"""
|
658 |
+
|
659 |
+
last_hidden_state: torch.FloatTensor = None
|
660 |
+
pooler_output: torch.FloatTensor = None
|
661 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
662 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
663 |
+
attn_mask: Optional[torch.FloatTensor] = None
|
664 |
+
|
665 |
+
# Revised from CLIPVisionTransformer to support attention mask.
|
666 |
+
# self: a CLIPVisionTransformer instance.
|
667 |
+
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py#L821
|
668 |
+
# pixel_values: preprocessed B*C*H*W images. [BS, 3, 224, 224]
|
669 |
+
# attn_mask: B*H*W attention mask.
|
670 |
+
def CLIPVisionTransformer_forward(self, pixel_values = None, attn_mask=None,
|
671 |
+
output_attentions = None,
|
672 |
+
output_hidden_states = None, return_dict = None):
|
673 |
+
|
674 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
675 |
+
output_hidden_states = (
|
676 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
677 |
+
)
|
678 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
679 |
+
|
680 |
+
if pixel_values is None:
|
681 |
+
raise ValueError("You have to specify pixel_values")
|
682 |
+
|
683 |
+
# Visual tokens are flattened in embeddings().
|
684 |
+
# self.embeddings: CLIPVisionEmbeddings.
|
685 |
+
# hidden_states: [BS, 257, 1280]. 257: 16*16 (patch_embeds) + 1 (class_embeds).
|
686 |
+
# 16*16 is output from Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False).
|
687 |
+
hidden_states = self.embeddings(pixel_values)
|
688 |
+
hidden_states = self.pre_layrnorm(hidden_states)
|
689 |
+
|
690 |
+
if attn_mask is not None:
|
691 |
+
# feat_edge_size: 16.
|
692 |
+
feat_edge_size = np.sqrt(hidden_states.shape[1] - 1).astype(int)
|
693 |
+
# attn_mask: [BS, 512, 512] -> [BS, 1, 16, 16].
|
694 |
+
attn_mask = F.interpolate(attn_mask.unsqueeze(1), size=(feat_edge_size, feat_edge_size), mode='nearest')
|
695 |
+
# Flatten the mask: [BS, 1, 16, 16] => [BS, 1, 256].
|
696 |
+
attn_mask = attn_mask.flatten(2)
|
697 |
+
# Prepend 1 to the mask: [BS, 1, 256] => [BS, 1, 257].
|
698 |
+
# This 1 corresponds to class_embeds, which is always attended to.
|
699 |
+
attn_mask = torch.cat([torch.ones_like(attn_mask[:, :, :1]), attn_mask], dim=-1)
|
700 |
+
attn_mask_pairs = torch.matmul(attn_mask.transpose(-1, -2), attn_mask).unsqueeze(1)
|
701 |
+
else:
|
702 |
+
attn_mask_pairs = None
|
703 |
+
|
704 |
+
# encoder: CLIPEncoder.
|
705 |
+
encoder_outputs = self.encoder(
|
706 |
+
inputs_embeds=hidden_states,
|
707 |
+
# New feature: (***The official documentation is wrong***)
|
708 |
+
# attention_mask (`torch.Tensor` of shape `(batch_size, 1, sequence_length, sequence_length)`, *optional*):
|
709 |
+
# Mask to avoid performing attention on pairs of tokens. Mask values selected in `[0, 1]`:
|
710 |
+
# - 1 for pairs that are **not masked**,
|
711 |
+
# - 0 for pairs that are **masked**.
|
712 |
+
# attention_mask is eventually used by CLIPEncoderLayer:
|
713 |
+
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py#L370
|
714 |
+
attention_mask=attn_mask_pairs,
|
715 |
+
output_attentions=output_attentions, # False
|
716 |
+
output_hidden_states=output_hidden_states, # True
|
717 |
+
return_dict=return_dict, # True
|
718 |
+
)
|
719 |
+
|
720 |
+
# last_hidden_state: [BS, 257, 1280]
|
721 |
+
last_hidden_state = encoder_outputs[0]
|
722 |
+
pooled_output = last_hidden_state[:, 0, :]
|
723 |
+
pooled_output = self.post_layernorm(pooled_output)
|
724 |
+
|
725 |
+
# return_dict is True.
|
726 |
+
if not return_dict:
|
727 |
+
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
|
728 |
+
|
729 |
+
return BaseModelOutputWithPooling2(
|
730 |
+
last_hidden_state=last_hidden_state,
|
731 |
+
pooler_output=pooled_output,
|
732 |
+
hidden_states=encoder_outputs.hidden_states,
|
733 |
+
attentions=encoder_outputs.attentions,
|
734 |
+
# Newly added: return resized flattened attention mask.
|
735 |
+
# [BS, 1, 257] -> [BS, 257, 1]
|
736 |
+
attn_mask=attn_mask.permute(0, 2, 1) if attn_mask is not None else None
|
737 |
+
)
|
738 |
+
|
739 |
+
|
740 |
+
class CLIPVisionModelWithMask(CLIPVisionModel):
|
741 |
+
def __init__(self, config):
|
742 |
+
super().__init__(config)
|
743 |
+
# Replace vision_model.forward() with the new one that supports mask.
|
744 |
+
self.vision_model.forward = CLIPVisionTransformer_forward.__get__(self.vision_model)
|
745 |
+
|
746 |
+
def forward(self, pixel_values = None, attn_mask = None, output_attentions = None,
|
747 |
+
output_hidden_states = None, return_dict = None):
|
748 |
+
|
749 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
750 |
+
|
751 |
+
return self.vision_model(
|
752 |
+
pixel_values=pixel_values,
|
753 |
+
attn_mask=attn_mask,
|
754 |
+
output_attentions=output_attentions,
|
755 |
+
output_hidden_states=output_hidden_states,
|
756 |
+
return_dict=return_dict,
|
757 |
+
)
|
758 |
+
|
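A hedged usage sketch for CLIPVisionModelWithMask (not part of the diff). The checkpoint name is an assumption; the 257x1280 shapes in the comments above correspond to a ViT-H/14 backbone, whereas the ViT-L/14 checkpoint used here yields 257x1024 features.

import torch

# Assumed checkpoint; any CLIP checkpoint loadable by CLIPVisionModel should work.
clip_vision = CLIPVisionModelWithMask.from_pretrained("openai/clip-vit-large-patch14")
clip_vision.eval()

pixel_values = torch.randn(2, 3, 224, 224)   # preprocessed B*C*H*W images
fg_mask = torch.ones(2, 512, 512)            # B*H*W mask: 1 = attend, 0 = mask out
with torch.no_grad():
    out = clip_vision(pixel_values=pixel_values, attn_mask=fg_mask, return_dict=True)
print(out.last_hidden_state.shape)           # [2, 257, 1024] for ViT-L/14
print(out.attn_mask.shape)                   # [2, 257, 1]: the resized, flattened mask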
adaface/util.py
ADDED
@@ -0,0 +1,342 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import numpy as np
|
5 |
+
from PIL import Image
|
6 |
+
import cv2
|
7 |
+
|
8 |
+
# add_noise_to_tensor() adds Gaussian noise to the tensor; noise_std is either absolute or relative to the tensor's std.
|
9 |
+
def add_noise_to_tensor(ts, noise_std, noise_std_is_relative=True, keep_norm=False,
|
10 |
+
std_dim=-1, norm_dim=-1):
|
11 |
+
if noise_std_is_relative:
|
12 |
+
ts_std_mean = ts.std(dim=std_dim).mean().detach()
|
13 |
+
noise_std *= ts_std_mean
|
14 |
+
|
15 |
+
noise = torch.randn_like(ts) * noise_std
|
16 |
+
if keep_norm:
|
17 |
+
orig_norm = ts.norm(dim=norm_dim, keepdim=True)
|
18 |
+
ts = ts + noise
|
19 |
+
new_norm = ts.norm(dim=norm_dim, keepdim=True).detach()
|
20 |
+
ts = ts * orig_norm / (new_norm + 1e-8)
|
21 |
+
else:
|
22 |
+
ts = ts + noise
|
23 |
+
|
24 |
+
return ts
|
25 |
+
|
26 |
+
|
27 |
+
# Revised from RevGrad, by removing the grad negation.
|
28 |
+
class ScaleGrad(torch.autograd.Function):
|
29 |
+
@staticmethod
|
30 |
+
def forward(ctx, input_, alpha_, debug=False):
|
31 |
+
ctx.save_for_backward(alpha_, debug)
|
32 |
+
output = input_
|
33 |
+
if debug:
|
34 |
+
print(f"input: {input_.abs().mean().item()}")
|
35 |
+
return output
|
36 |
+
|
37 |
+
@staticmethod
|
38 |
+
def backward(ctx, grad_output): # pragma: no cover
|
39 |
+
# saved_tensors returns a tuple of tensors.
|
40 |
+
alpha_, debug = ctx.saved_tensors
|
41 |
+
if ctx.needs_input_grad[0]:
|
42 |
+
grad_output2 = grad_output * alpha_
|
43 |
+
if debug:
|
44 |
+
print(f"grad_output2: {grad_output2.abs().mean().item()}")
|
45 |
+
else:
|
46 |
+
grad_output2 = None
|
47 |
+
return grad_output2, None, None
|
48 |
+
|
49 |
+
class GradientScaler(nn.Module):
|
50 |
+
def __init__(self, alpha=1., debug=False, *args, **kwargs):
|
51 |
+
"""
|
52 |
+
A gradient scaling layer.
|
53 |
+
This layer has no parameters, and simply scales the gradient in the backward pass.
|
54 |
+
"""
|
55 |
+
super().__init__(*args, **kwargs)
|
56 |
+
|
57 |
+
self._alpha = torch.tensor(alpha, requires_grad=False)
|
58 |
+
self._debug = torch.tensor(debug, requires_grad=False)
|
59 |
+
|
60 |
+
def forward(self, input_):
|
61 |
+
_debug = self._debug if hasattr(self, '_debug') else False
|
62 |
+
return ScaleGrad.apply(input_, self._alpha.to(input_.device), _debug)
|
63 |
+
|
64 |
+
def gen_gradient_scaler(alpha, debug=False):
|
65 |
+
if alpha == 1:
|
66 |
+
return nn.Identity()
|
67 |
+
if alpha > 0:
|
68 |
+
return GradientScaler(alpha, debug=debug)
|
69 |
+
else:
|
70 |
+
assert alpha == 0
|
71 |
+
# Don't use lambda function here, otherwise the object can't be pickled.
|
72 |
+
return torch.detach
|
73 |
+
|
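A small sketch (not from the repo) of how the two utilities above behave; the tensors are arbitrary.

import torch

x = torch.randn(4, 8, requires_grad=True)
scaler = gen_gradient_scaler(0.1)   # identity in the forward pass, scales gradients by 0.1 in backward
y = scaler(x)
y.sum().backward()
print(x.grad.mean().item())         # ~0.1 instead of 1.0

noisy = add_noise_to_tensor(x.detach(), noise_std=0.05, keep_norm=True)
print((noisy.norm(dim=-1) - x.detach().norm(dim=-1)).abs().max().item())  # ~0: norms are preserved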
74 |
+
#@torch.autocast(device_type="cuda")
|
75 |
+
# In AdaFaceWrapper, input_max_length is 22.
|
76 |
+
def arc2face_forward_face_embs(tokenizer, arc2face_text_encoder, face_embs,
|
77 |
+
input_max_length=77, return_full_and_core_embs=True):
|
78 |
+
|
79 |
+
'''
|
80 |
+
arc2face_text_encoder: arc2face_models.py CLIPTextModelWrapper instance.
|
81 |
+
face_embs: (N, 512) normalized ArcFace embeddings.
|
82 |
+
return_full_and_core_embs: Return both the full prompt embeddings and the core embeddings.
|
83 |
+
If False, return only the core embeddings.
|
84 |
+
|
85 |
+
'''
|
86 |
+
|
87 |
+
# arcface_token_id: 1014
|
88 |
+
arcface_token_id = tokenizer.encode("id", add_special_tokens=False)[0]
|
89 |
+
|
90 |
+
# This step should be quite fast, and there's no need to cache the input_ids.
|
91 |
+
input_ids = tokenizer(
|
92 |
+
"photo of a id person",
|
93 |
+
truncation=True,
|
94 |
+
padding="max_length",
|
95 |
+
max_length=input_max_length, #tokenizer.model_max_length,
|
96 |
+
return_tensors="pt",
|
97 |
+
).input_ids.to(face_embs.device)
|
98 |
+
# input_ids: [1, 77] or [3, 77] (during training).
|
99 |
+
input_ids = input_ids.repeat(len(face_embs), 1)
|
100 |
+
face_embs_dtype = face_embs.dtype
|
101 |
+
face_embs = face_embs.to(arc2face_text_encoder.dtype)
|
102 |
+
# face_embs_padded: [1, 512] -> [1, 768].
|
103 |
+
face_embs_padded = F.pad(face_embs, (0, arc2face_text_encoder.config.hidden_size - face_embs.shape[-1]), "constant", 0)
|
104 |
+
# arc2face_text_encoder(input_ids=input_ids, ...) is called twice. The first is only to get the token embeddings (the shallowest mapping).
|
105 |
+
# The second call does the ordinary CLIP text encoding pass.
|
106 |
+
token_embs = arc2face_text_encoder(input_ids=input_ids, return_token_embs=True)
|
107 |
+
token_embs[input_ids==arcface_token_id] = face_embs_padded
|
108 |
+
|
109 |
+
prompt_embeds = arc2face_text_encoder(
|
110 |
+
input_ids=input_ids,
|
111 |
+
input_token_embs=token_embs,
|
112 |
+
return_token_embs=False
|
113 |
+
)[0]
|
114 |
+
|
115 |
+
# Restore the original dtype of prompt_embeds: float16 -> float32.
|
116 |
+
prompt_embeds = prompt_embeds.to(face_embs_dtype)
|
117 |
+
|
118 |
+
if return_full_and_core_embs:
|
119 |
+
# token 4: 'id' in "photo of a id person".
|
120 |
+
# 4:20 are the most important 16 embeddings that contain the subject's identity.
|
121 |
+
# [N, 77, 768] -> [N, 16, 768]
|
122 |
+
return prompt_embeds, prompt_embeds[:, 4:20]
|
123 |
+
else:
|
124 |
+
# [N, 16, 768]
|
125 |
+
return prompt_embeds[:, 4:20]
|
126 |
+
|
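A hypothetical sketch of calling arc2face_forward_face_embs with random, normalized face embeddings. The tokenizer repo name is an assumption; the encoder path mirrors adaface_wrapper.py further below.

import torch
import torch.nn.functional as F
from transformers import CLIPTokenizer
from adaface.arc2face_models import CLIPTextModelWrapper

# Assumed tokenizer source; any SD 1.5-style CLIP tokenizer should do.
tokenizer = CLIPTokenizer.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="tokenizer")
text_encoder = CLIPTextModelWrapper.from_pretrained("models/arc2face", subfolder="encoder",
                                                    torch_dtype=torch.float16).cuda()
face_embs = F.normalize(torch.randn(2, 512, device="cuda"), dim=-1)
full_embs, core_embs = arc2face_forward_face_embs(tokenizer, text_encoder, face_embs,
                                                  input_max_length=22)
print(full_embs.shape, core_embs.shape)   # [2, 22, 768], [2, 16, 768]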
127 |
+
def get_b_core_e_embeddings(prompt_embeds, length=22):
|
128 |
+
b_core_e_embs = torch.cat([ prompt_embeds[:, :length], prompt_embeds[:, [-1]] ], dim=1)
|
129 |
+
return b_core_e_embs
|
130 |
+
|
131 |
+
# return_emb_types: a list of strings, each string is among ['full', 'full_half_pad', 'full_pad', 'core', 'full_zeroed_extra', 'b_core_e'].
|
132 |
+
def arc2face_inverse_face_prompt_embs(clip_tokenizer, inverse_text_encoder, face_prompt_embs, list_extra_words,
|
133 |
+
return_emb_types, pad_embeddings, hidden_state_layer_weights=None,
|
134 |
+
input_max_length=77, zs_extra_words_scale=0.5):
|
135 |
+
|
136 |
+
'''
|
137 |
+
inverse_text_encoder: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
|
138 |
+
inverse_text_encoder is NOT the original arc2face text encoder, but retrained to do inverse mapping.
|
139 |
+
face_prompt_embs: (BS, 16, 768). Only the core embeddings, no paddings.
|
140 |
+
list_extra_words: [s_1, ..., s_BS], each s_i is a list of extra words to be added to the prompt.
|
141 |
+
return_full_and_core_embs: Return both the full prompt embeddings and the core embeddings.
|
142 |
+
If False, return only the core embeddings.
|
143 |
+
'''
|
144 |
+
|
145 |
+
if list_extra_words is not None:
|
146 |
+
if len(list_extra_words) != len(face_prompt_embs):
|
147 |
+
if len(face_prompt_embs) > 1:
|
148 |
+
print("Warn: list_extra_words has different length as face_prompt_embs.")
|
149 |
+
if len(list_extra_words) == 1:
|
150 |
+
list_extra_words = list_extra_words * len(face_prompt_embs)
|
151 |
+
else:
|
152 |
+
breakpoint()
|
153 |
+
else:
|
154 |
+
# len(face_prompt_embs) == 1, this occurs when same_subject_in_batch == True, e.g. in do_mix_prompt_distillation.
|
155 |
+
# But list_extra_words always corresponds to the actual batch size. So we only take the first element.
|
156 |
+
list_extra_words = list_extra_words[:1]
|
157 |
+
|
158 |
+
for extra_words in list_extra_words:
|
159 |
+
assert len(extra_words.split()) <= 2, "Each extra_words string should consist of at most 2 words."
|
160 |
+
# 16 ", " are placeholders for face_prompt_embs.
|
161 |
+
prompt_templates = [ "photo of a " + ", " * 16 + list_extra_words[i] for i in range(len(list_extra_words)) ]
|
162 |
+
else:
|
163 |
+
# 16 ", " are placeholders for face_prompt_embs.
|
164 |
+
# No extra words are added to the prompt.
|
165 |
+
prompt_templates = [ "photo of a " + ", " * 16 for _ in range(len(face_prompt_embs)) ]
|
166 |
+
|
167 |
+
# This step should be quite fast, and there's no need to cache the input_ids.
|
168 |
+
# input_ids: [BS, 77].
|
169 |
+
input_ids = clip_tokenizer(
|
170 |
+
prompt_templates,
|
171 |
+
truncation=True,
|
172 |
+
padding="max_length",
|
173 |
+
max_length=input_max_length,
|
174 |
+
return_tensors="pt",
|
175 |
+
).input_ids.to(face_prompt_embs.device)
|
176 |
+
|
177 |
+
face_prompt_embs_dtype = face_prompt_embs.dtype
|
178 |
+
face_prompt_embs = face_prompt_embs.to(inverse_text_encoder.dtype)
|
179 |
+
|
180 |
+
# token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
|
181 |
+
token_embs = inverse_text_encoder(input_ids=input_ids, return_token_embs=True)
|
182 |
+
# token 4: first ", " in the template prompt.
|
183 |
+
# Replace embeddings of 16 placeholder ", " with face_prompt_embs.
|
184 |
+
token_embs[:, 4:20] = face_prompt_embs
|
185 |
+
|
186 |
+
# This call does the ordinary CLIP text encoding pass.
|
187 |
+
prompt_embeds = inverse_text_encoder(
|
188 |
+
input_ids=input_ids,
|
189 |
+
input_token_embs=token_embs,
|
190 |
+
hidden_state_layer_weights=hidden_state_layer_weights,
|
191 |
+
return_token_embs=False
|
192 |
+
)[0]
|
193 |
+
|
194 |
+
# Restore the original dtype of prompt_embeds: float16 -> float32.
|
195 |
+
prompt_embeds = prompt_embeds.to(face_prompt_embs_dtype)
|
196 |
+
# token 4: first ", " in the template prompt.
|
197 |
+
# 4:20 are the most important 16 embeddings that contain the subject's identity.
|
198 |
+
# 20:22 are embeddings of the (at most) two extra words.
|
199 |
+
# [N, 77, 768] -> [N, 16, 768]
|
200 |
+
core_prompt_embs = prompt_embeds[:, 4:20]
|
201 |
+
if list_extra_words is not None:
|
202 |
+
# [N, 16, 768] -> [N, 18, 768]
|
203 |
+
extra_words_embs = prompt_embeds[:, 20:22] * zs_extra_words_scale
|
204 |
+
core_prompt_embs = torch.cat([core_prompt_embs, extra_words_embs], dim=1)
|
205 |
+
|
206 |
+
return_prompts = []
|
207 |
+
for emb_type in return_emb_types:
|
208 |
+
if emb_type == 'full':
|
209 |
+
return_prompts.append(prompt_embeds)
|
210 |
+
elif emb_type == 'full_half_pad':
|
211 |
+
prompt_embeds2 = prompt_embeds.clone()
|
212 |
+
PADS = prompt_embeds2.shape[1] - 23
|
213 |
+
if PADS >= 2:
|
214 |
+
# Fill half of the remaining embeddings with pad embeddings.
|
215 |
+
prompt_embeds2[:, 22:22+PADS//2] = pad_embeddings[22:22+PADS//2]
|
216 |
+
return_prompts.append(prompt_embeds2)
|
217 |
+
elif emb_type == 'full_pad':
|
218 |
+
prompt_embeds2 = prompt_embeds.clone()
|
219 |
+
# Fill the 22nd to the second last embeddings with pad embeddings.
|
220 |
+
prompt_embeds2[:, 22:-1] = pad_embeddings[22:-1]
|
221 |
+
return_prompts.append(prompt_embeds2)
|
222 |
+
elif emb_type == 'core':
|
223 |
+
return_prompts.append(core_prompt_embs)
|
224 |
+
elif emb_type == 'full_zeroed_extra':
|
225 |
+
prompt_embeds2 = prompt_embeds.clone()
|
226 |
+
# Only add two pad embeddings. The remaining embeddings are set to 0.
|
227 |
+
# Make the positional embeddings align with the actual positions.
|
228 |
+
prompt_embeds2[:, 22:24] = pad_embeddings[22:24]
|
229 |
+
prompt_embeds2[:, 24:-1] = 0
|
230 |
+
return_prompts.append(prompt_embeds2)
|
231 |
+
elif emb_type == 'b_core_e':
|
232 |
+
# The first 22 embeddings, plus the last EOS embedding.
|
233 |
+
b_core_e_embs = get_b_core_e_embeddings(prompt_embeds, length=22)
|
234 |
+
return_prompts.append(b_core_e_embs)
|
235 |
+
else:
|
236 |
+
breakpoint()
|
237 |
+
|
238 |
+
return return_prompts
|
239 |
+
|
240 |
+
# if pre_face_embs is None, generate random face embeddings [BS, 512].
|
241 |
+
# image_folder is passed only for logging purpose. image_paths contains the paths of the images.
|
242 |
+
def get_arc2face_id_prompt_embs(face_app, clip_tokenizer, arc2face_text_encoder,
|
243 |
+
extract_faceid_embeds, pre_face_embs,
|
244 |
+
image_folder, image_paths, images_np,
|
245 |
+
id_batch_size, device,
|
246 |
+
input_max_length=77, noise_level=0.0,
|
247 |
+
return_core_id_embs=False,
|
248 |
+
gen_neg_prompt=False, verbose=False):
|
249 |
+
face_image_count = 0
|
250 |
+
|
251 |
+
if extract_faceid_embeds:
|
252 |
+
faceid_embeds = []
|
253 |
+
if image_paths is not None:
|
254 |
+
images_np = []
|
255 |
+
for image_path in image_paths:
|
256 |
+
image_np = np.array(Image.open(image_path))
|
257 |
+
images_np.append(image_np)
|
258 |
+
|
259 |
+
for i, image_np in enumerate(images_np):
|
260 |
+
image_obj = Image.fromarray(image_np).resize((512, 512), Image.NEAREST)
|
261 |
+
# Remove alpha channel if it exists.
|
262 |
+
if image_obj.mode == 'RGBA':
|
263 |
+
image_obj = image_obj.convert('RGB')
|
264 |
+
# This seems NOT a bug. The input image should be in BGR format, as per
|
265 |
+
# https://github.com/deepinsight/insightface/issues/524
|
266 |
+
image_np = cv2.cvtColor(np.array(image_obj), cv2.COLOR_RGB2BGR)
|
267 |
+
image_np = np.array(image_obj)
|
268 |
+
|
269 |
+
face_infos = face_app.get(image_np)
|
270 |
+
if verbose and image_paths is not None:
|
271 |
+
print(image_paths[i], len(face_infos))
|
272 |
+
# Assume all images belong to the same subject. Therefore, we can skip the images with no face detected.
|
273 |
+
if len(face_infos) == 0:
|
274 |
+
continue
|
275 |
+
# only use the maximum face
|
276 |
+
face_info = sorted(face_infos, key=lambda x:(x['bbox'][2]-x['bbox'][0])*(x['bbox'][3]-x['bbox'][1]))[-1]
|
277 |
+
# Each faceid_embed: [1, 512]
|
278 |
+
faceid_embeds.append(torch.from_numpy(face_info.normed_embedding).unsqueeze(0))
|
279 |
+
face_image_count += 1
|
280 |
+
|
281 |
+
if verbose:
|
282 |
+
if image_folder is not None:
|
283 |
+
print(f"Extracted ID embeddings from {face_image_count} images in {image_folder}")
|
284 |
+
else:
|
285 |
+
print(f"Extracted ID embeddings from {face_image_count} images")
|
286 |
+
|
287 |
+
if len(faceid_embeds) == 0:
|
288 |
+
print("No face detected. Use a random face instead.")
|
289 |
+
faceid_embeds = torch.randn(id_batch_size, 512).to(device=device, dtype=torch.float16)
|
290 |
+
else:
|
291 |
+
# faceid_embeds: [10, 512]
|
292 |
+
faceid_embeds = torch.cat(faceid_embeds, dim=0)
|
293 |
+
# faceid_embeds: [10, 512] -> [1, 512].
|
294 |
+
# and the resulting prompt embeddings are the same.
|
295 |
+
faceid_embeds = faceid_embeds.mean(dim=0, keepdim=True).to(device=device, dtype=torch.float16)
|
296 |
+
else:
|
297 |
+
# Random face embeddings. faceid_embeds: [BS, 512].
|
298 |
+
if pre_face_embs is None:
|
299 |
+
faceid_embeds = torch.randn(id_batch_size, 512)
|
300 |
+
else:
|
301 |
+
faceid_embeds = pre_face_embs
|
302 |
+
if pre_face_embs.shape[0] == 1:
|
303 |
+
faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
|
304 |
+
|
305 |
+
faceid_embeds = faceid_embeds.to(device=device, dtype=torch.float16)
|
306 |
+
|
307 |
+
if noise_level > 0:
|
308 |
+
# If id_batch_size > 1, after adding noises, the id_batch_size embeddings will be different.
|
309 |
+
faceid_embeds = add_noise_to_tensor(faceid_embeds, noise_level, noise_std_is_relative=True, keep_norm=True)
|
310 |
+
|
311 |
+
faceid_embeds = F.normalize(faceid_embeds, p=2, dim=-1)
|
312 |
+
|
313 |
+
# arc2face_pos_prompt_emb, arc2face_neg_prompt_emb: [BS, 77, 768]
|
314 |
+
with torch.no_grad():
|
315 |
+
arc2face_pos_prompt_emb, arc2face_pos_core_prompt_emb = \
|
316 |
+
arc2face_forward_face_embs(clip_tokenizer, arc2face_text_encoder,
|
317 |
+
faceid_embeds, input_max_length=input_max_length,
|
318 |
+
return_full_and_core_embs=True)
|
319 |
+
if return_core_id_embs:
|
320 |
+
arc2face_pos_prompt_emb = arc2face_pos_core_prompt_emb
|
321 |
+
# If extract_faceid_embeds, we assume all images are from the same subject, and the batch dim of faceid_embeds is 1.
|
322 |
+
# So we need to repeat faceid_embeds.
|
323 |
+
if extract_faceid_embeds:
|
324 |
+
faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
|
325 |
+
arc2face_pos_prompt_emb = arc2face_pos_prompt_emb.repeat(id_batch_size, 1, 1)
|
326 |
+
|
327 |
+
if gen_neg_prompt:
|
328 |
+
with torch.no_grad():
|
329 |
+
arc2face_neg_prompt_emb, arc2face_neg_core_prompt_emb = \
|
330 |
+
arc2face_forward_face_embs(clip_tokenizer, arc2face_text_encoder,
|
331 |
+
torch.zeros_like(faceid_embeds),
|
332 |
+
input_max_length=input_max_length,
|
333 |
+
return_full_and_core_embs=True)
|
334 |
+
if return_core_id_embs:
|
335 |
+
arc2face_neg_prompt_emb = arc2face_neg_core_prompt_emb
|
336 |
+
|
337 |
+
#if extract_faceid_embeds:
|
338 |
+
# arc2face_neg_prompt_emb = arc2face_neg_prompt_emb.repeat(id_batch_size, 1, 1)
|
339 |
+
return face_image_count, faceid_embeds, arc2face_pos_prompt_emb, arc2face_neg_prompt_emb
|
340 |
+
else:
|
341 |
+
return face_image_count, faceid_embeds, arc2face_pos_prompt_emb
|
342 |
+
|
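Before moving on to the wrapper, a hedged sketch of how get_arc2face_id_prompt_embs ties the pieces above together. face_app, clip_tokenizer and arc2face_text_encoder are assumed to be initialized as in AdaFaceWrapper.initialize_pipeline() below; the image paths are placeholders.

import torch

face_image_count, faceid_embeds, id_prompt_emb = get_arc2face_id_prompt_embs(
    face_app, clip_tokenizer, arc2face_text_encoder,   # assumed to be initialized elsewhere
    extract_faceid_embeds=True, pre_face_embs=None,
    image_folder=None, image_paths=["subject1.jpg", "subject2.jpg"],  # placeholder paths
    images_np=None, id_batch_size=1, device="cuda",
    input_max_length=22, noise_level=0.0,
    return_core_id_embs=True, gen_neg_prompt=False, verbose=True)
print(faceid_embeds.shape, id_prompt_emb.shape)   # [1, 512], [1, 16, 768]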
adaface_wrapper.py
ADDED
@@ -0,0 +1,297 @@
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import CLIPTextModel
|
4 |
+
from diffusers import (
|
5 |
+
StableDiffusionPipeline,
|
6 |
+
StableDiffusionImg2ImgPipeline,
|
7 |
+
UNet2DConditionModel,
|
8 |
+
DDIMScheduler,
|
9 |
+
AutoencoderKL,
|
10 |
+
)
|
11 |
+
from insightface.app import FaceAnalysis
|
12 |
+
from adaface.arc2face_models import CLIPTextModelWrapper
|
13 |
+
from adaface.util import get_arc2face_id_prompt_embs
|
14 |
+
import re, os
|
15 |
+
import sys
|
16 |
+
sys.modules['ldm'] = sys.modules['adaface']
|
17 |
+
|
18 |
+
class AdaFaceWrapper(nn.Module):
|
19 |
+
def __init__(self, pipeline_name, base_model_path, adaface_ckpt_path, device,
|
20 |
+
subject_string='z', num_vectors=16,
|
21 |
+
num_inference_steps=50, negative_prompt=None,
|
22 |
+
use_840k_vae=False, use_ds_text_encoder=False, is_training=False):
|
23 |
+
'''
|
24 |
+
pipeline_name: "text2img" or "img2img" or None. If None, the unet and vae are
|
25 |
+
removed from the pipeline to release RAM.
|
26 |
+
'''
|
27 |
+
super().__init__()
|
28 |
+
self.pipeline_name = pipeline_name
|
29 |
+
self.base_model_path = base_model_path
|
30 |
+
self.adaface_ckpt_path = adaface_ckpt_path
|
31 |
+
self.use_840k_vae = use_840k_vae
|
32 |
+
self.use_ds_text_encoder = use_ds_text_encoder
|
33 |
+
self.subject_string = subject_string
|
34 |
+
self.num_vectors = num_vectors
|
35 |
+
self.num_inference_steps = num_inference_steps
|
36 |
+
self.device = device
|
37 |
+
self.is_training = is_training
|
38 |
+
self.initialize_pipeline()
|
39 |
+
self.extend_tokenizer_and_text_encoder()
|
40 |
+
if negative_prompt is None:
|
41 |
+
self.negative_prompt = \
|
42 |
+
"flaws in the eyes, flaws in the face, lowres, non-HDRi, low quality, worst quality, artifacts, noise, text, watermark, glitch, " \
|
43 |
+
"mutated, ugly, disfigured, hands, partially rendered objects, partially rendered eyes, deformed eyeballs, cross-eyed, blurry, " \
|
44 |
+
"mutation, duplicate, out of frame, cropped, mutilated, bad anatomy, deformed, bad proportions, " \
|
45 |
+
"nude, naked, nsfw, topless, bare breasts"
|
46 |
+
else:
|
47 |
+
self.negative_prompt = negative_prompt
|
48 |
+
|
49 |
+
def load_subj_basis_generator(self, adaface_ckpt_path):
|
50 |
+
ckpt = torch.load(adaface_ckpt_path, map_location='cpu')
|
51 |
+
string_to_subj_basis_generator_dict = ckpt["string_to_subj_basis_generator_dict"]
|
52 |
+
if self.subject_string not in string_to_subj_basis_generator_dict:
|
53 |
+
print(f"Subject '{self.subject_string}' not found in the embedding manager.")
|
54 |
+
breakpoint()
|
55 |
+
|
56 |
+
self.subj_basis_generator = string_to_subj_basis_generator_dict[self.subject_string]
|
57 |
+
# In the original ckpt, num_out_layers is 16 for layerwise embeddings.
|
58 |
+
# But we don't do layerwise embeddings here, so we set it to 1.
|
59 |
+
self.subj_basis_generator.num_out_layers = 1
|
60 |
+
print(f"Loaded subject basis generator for '{self.subject_string}'.")
|
61 |
+
print(repr(self.subj_basis_generator))
|
62 |
+
self.subj_basis_generator.to(self.device)
|
63 |
+
if self.is_training:
|
64 |
+
self.subj_basis_generator.train()
|
65 |
+
else:
|
66 |
+
self.subj_basis_generator.eval()
|
67 |
+
|
68 |
+
def initialize_pipeline(self):
|
69 |
+
self.load_subj_basis_generator(self.adaface_ckpt_path)
|
70 |
+
# arc2face_text_encoder maps the face analysis embedding to 16 face embeddings
|
71 |
+
# in the UNet image space.
|
72 |
+
arc2face_text_encoder = CLIPTextModelWrapper.from_pretrained(
|
73 |
+
'models/arc2face', subfolder="encoder", torch_dtype=torch.float16
|
74 |
+
)
|
75 |
+
self.arc2face_text_encoder = arc2face_text_encoder.to(self.device)
|
76 |
+
|
77 |
+
if self.use_840k_vae:
|
78 |
+
# The 840000-step vae model is slightly better in face details than the original vae model.
|
79 |
+
# https://huggingface.co/stabilityai/sd-vae-ft-mse-original
|
80 |
+
vae = AutoencoderKL.from_single_file("models/diffusers/sd-vae-ft-mse-original/vae-ft-mse-840000-ema-pruned.ckpt", torch_dtype=torch.float16)
|
81 |
+
else:
|
82 |
+
vae = None
|
83 |
+
|
84 |
+
if self.use_ds_text_encoder:
|
85 |
+
# The dreamshaper v7 finetuned text encoder follows the prompt slightly better than the original text encoder.
|
86 |
+
# https://huggingface.co/Lykon/DreamShaper/tree/main/text_encoder
|
87 |
+
text_encoder = CLIPTextModel.from_pretrained("models/ds_text_encoder", torch_dtype=torch.float16)
|
88 |
+
else:
|
89 |
+
text_encoder = None
|
90 |
+
|
91 |
+
remove_unet = False
|
92 |
+
|
93 |
+
if self.pipeline_name == "img2img":
|
94 |
+
PipelineClass = StableDiffusionImg2ImgPipeline
|
95 |
+
elif self.pipeline_name == "text2img":
|
96 |
+
PipelineClass = StableDiffusionPipeline
|
97 |
+
# pipeline_name is None means only use this instance to generate adaface embeddings, not to generate images.
|
98 |
+
elif self.pipeline_name is None:
|
99 |
+
PipelineClass = StableDiffusionPipeline
|
100 |
+
remove_unet = True
|
101 |
+
else:
|
102 |
+
raise ValueError(f"Unknown pipeline name: {self.pipeline_name}")
|
103 |
+
|
104 |
+
if os.path.isfile(self.base_model_path):
|
105 |
+
pipeline = PipelineClass.from_single_file(
|
106 |
+
self.base_model_path,
|
107 |
+
torch_dtype=torch.float16
|
108 |
+
)
|
109 |
+
else:
|
110 |
+
pipeline = PipelineClass.from_pretrained(
|
111 |
+
self.base_model_path,
|
112 |
+
torch_dtype=torch.float16,
|
113 |
+
safety_checker=None
|
114 |
+
)
|
115 |
+
print(f"Loaded pipeline from {self.base_model_path}.")
|
116 |
+
|
117 |
+
if self.use_840k_vae:
|
118 |
+
pipeline.vae = vae
|
119 |
+
print("Replaced the VAE with the 840k-step VAE.")
|
120 |
+
|
121 |
+
if self.use_ds_text_encoder:
|
122 |
+
pipeline.text_encoder = text_encoder
|
123 |
+
print("Replaced the text encoder with the DreamShaper text encoder.")
|
124 |
+
|
125 |
+
if remove_unet:
|
126 |
+
# Remove unet and vae to release RAM. Only keep tokenizer and text_encoder.
|
127 |
+
pipeline.unet = None
|
128 |
+
pipeline.vae = None
|
129 |
+
print("Removed UNet and VAE from the pipeline.")
|
130 |
+
|
131 |
+
noise_scheduler = DDIMScheduler(
|
132 |
+
num_train_timesteps=1000,
|
133 |
+
beta_start=0.00085,
|
134 |
+
beta_end=0.012,
|
135 |
+
beta_schedule="scaled_linear",
|
136 |
+
clip_sample=False,
|
137 |
+
set_alpha_to_one=False,
|
138 |
+
steps_offset=1,
|
139 |
+
)
|
140 |
+
|
141 |
+
pipeline.scheduler = noise_scheduler
|
142 |
+
self.pipeline = pipeline.to(self.device)
|
143 |
+
# FaceAnalysis will try to find the ckpt in: models/insightface/models/antelopev2.
|
144 |
+
# Note there's a second "model" in the path.
|
145 |
+
self.face_app = FaceAnalysis(name='antelopev2', root='models/insightface', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
|
146 |
+
self.face_app.prepare(ctx_id=0, det_size=(512, 512))
|
147 |
+
# Patch the missing tokenizer in the subj_basis_generator.
|
148 |
+
if not hasattr(self.subj_basis_generator, 'clip_tokenizer'):
|
149 |
+
self.subj_basis_generator.clip_tokenizer = self.pipeline.tokenizer
|
150 |
+
print("Patched the missing tokenizer in the subj_basis_generator.")
|
151 |
+
|
152 |
+
def extend_tokenizer_and_text_encoder(self):
|
153 |
+
if self.num_vectors < 1:
|
154 |
+
raise ValueError(f"num_vectors has to be larger or equal to 1, but is {self.num_vectors}")
|
155 |
+
|
156 |
+
tokenizer = self.pipeline.tokenizer
|
157 |
+
# Add z0, z1, z2, ..., z15.
|
158 |
+
self.placeholder_tokens = []
|
159 |
+
for i in range(0, self.num_vectors):
|
160 |
+
self.placeholder_tokens.append(f"{self.subject_string}_{i}")
|
161 |
+
|
162 |
+
self.placeholder_tokens_str = " ".join(self.placeholder_tokens)
|
163 |
+
|
164 |
+
# Add the new tokens to the tokenizer.
|
165 |
+
num_added_tokens = tokenizer.add_tokens(self.placeholder_tokens)
|
166 |
+
if num_added_tokens != self.num_vectors:
|
167 |
+
raise ValueError(
|
168 |
+
f"The tokenizer already contains the token {self.subject_string}. Please pass a different"
|
169 |
+
" `subject_string` that is not already in the tokenizer.")
|
170 |
+
|
171 |
+
print(f"Added {num_added_tokens} tokens ({self.placeholder_tokens_str}) to the tokenizer.")
|
172 |
+
|
173 |
+
# placeholder_token_ids: [49408, ..., 49423].
|
174 |
+
self.placeholder_token_ids = tokenizer.convert_tokens_to_ids(self.placeholder_tokens)
|
175 |
+
# print(self.placeholder_token_ids)
|
176 |
+
# Resize the token embeddings as we are adding new special tokens to the tokenizer
|
177 |
+
old_weight = self.pipeline.text_encoder.get_input_embeddings().weight
|
178 |
+
self.pipeline.text_encoder.resize_token_embeddings(len(tokenizer))
|
179 |
+
new_weight = self.pipeline.text_encoder.get_input_embeddings().weight
|
180 |
+
print(f"Resized text encoder token embeddings from {old_weight.shape} to {new_weight.shape} on {new_weight.device}.")
|
181 |
+
|
182 |
+
# Extend pipeline.text_encoder with the adaface subject embeddings.
|
183 |
+
# subj_embs: [16, 768].
|
184 |
+
def update_text_encoder_subj_embs(self, subj_embs):
|
185 |
+
# Initialise the newly added placeholder token with the embeddings of the initializer token
|
186 |
+
token_embeds = self.pipeline.text_encoder.get_input_embeddings().weight.data
|
187 |
+
with torch.no_grad():
|
188 |
+
for i, token_id in enumerate(self.placeholder_token_ids):
|
189 |
+
token_embeds[token_id] = subj_embs[i]
|
190 |
+
print(f"Updated {len(self.placeholder_token_ids)} tokens ({self.placeholder_tokens_str}) in the text encoder.")
|
191 |
+
|
192 |
+
def update_prompt(self, prompt):
|
193 |
+
# If the placeholder tokens are already in the prompt, then return the prompt as is.
|
194 |
+
if self.placeholder_tokens_str in prompt:
|
195 |
+
return prompt
|
196 |
+
|
197 |
+
# If the subject string 'z' is not in the prompt, then simply prepend the placeholder tokens to the prompt.
|
198 |
+
if re.search(r'\b' + self.subject_string + r'\b', prompt) is None:
|
199 |
+
print(f"Subject string '{self.subject_string}' not found in the prompt. Adding it.")
|
200 |
+
comp_prompt = self.placeholder_tokens_str + " " + prompt
|
201 |
+
else:
|
202 |
+
# Replace the subject string 'z' with the placeholder tokens.
|
203 |
+
comp_prompt = re.sub(r'\b' + self.subject_string + r'\b', self.placeholder_tokens_str, prompt)
|
204 |
+
return comp_prompt
|
205 |
+
|
206 |
+
# image_paths: a list of image paths. image_folder: the parent folder name.
|
207 |
+
def generate_adaface_embeddings(self, image_paths, image_folder=None,
|
208 |
+
pre_face_embs=None, gen_rand_face=False,
|
209 |
+
out_id_embs_scale=1., noise_level=0, update_text_encoder=True):
|
210 |
+
# faceid_embeds is a batch of extracted face analysis embeddings (BS * 512 = id_batch_size * 512).
|
211 |
+
# If extract_faceid_embeds is True, faceid_embeds is *the same* embedding repeated by id_batch_size times.
|
212 |
+
# Otherwise, faceid_embeds is a batch of random embeddings, each instance is different.
|
213 |
+
# The same applies to id_prompt_emb.
|
214 |
+
# faceid_embeds is in the face analysis embeddings. id_prompt_emb is in the image prompt space.
|
215 |
+
# Here id_batch_size = 1, so
|
216 |
+
# faceid_embeds: [1, 512]. NOT used later.
|
217 |
+
# id_prompt_emb: [1, 16, 768].
|
218 |
+
# NOTE: Since return_core_id_embs is True, id_prompt_emb is only the 16 core ID embeddings.
|
219 |
+
# arc2face prompt template: "photo of a id person"
|
220 |
+
# ID embeddings start from "id person ...". So there are 3 template tokens before the 16 ID embeddings.
|
221 |
+
face_image_count, faceid_embeds, id_prompt_emb \
|
222 |
+
= get_arc2face_id_prompt_embs(self.face_app, self.pipeline.tokenizer, self.arc2face_text_encoder,
|
223 |
+
extract_faceid_embeds=not gen_rand_face,
|
224 |
+
pre_face_embs=pre_face_embs,
|
225 |
+
# image_folder is passed only for logging purpose.
|
226 |
+
# image_paths contains the paths of the images.
|
227 |
+
image_folder=image_folder, image_paths=image_paths,
|
228 |
+
images_np=None,
|
229 |
+
id_batch_size=1,
|
230 |
+
device=self.device,
|
231 |
+
# input_max_length == 22: only keep the first 22 tokens,
|
232 |
+
# including 3 template tokens and 16 ID tokens, and BOS and EOS tokens.
|
233 |
+
# The results are indistinguishable from input_max_length=77.
|
234 |
+
input_max_length=22,
|
235 |
+
noise_level=noise_level,
|
236 |
+
return_core_id_embs=True,
|
237 |
+
gen_neg_prompt=False,
|
238 |
+
verbose=True)
|
239 |
+
|
240 |
+
if face_image_count == 0:
|
241 |
+
return None
|
242 |
+
|
243 |
+
# adaface_subj_embs: [1, 1, 16, 768].
|
244 |
+
# adaface_prompt_embs: [1, 77, 768] (not used).
|
245 |
+
adaface_subj_embs, adaface_prompt_embs = \
|
246 |
+
self.subj_basis_generator(id_prompt_emb, None, None,
|
247 |
+
out_id_embs_scale=out_id_embs_scale,
|
248 |
+
is_face=True, is_training=False,
|
249 |
+
adaface_prompt_embs_inf_type='full_half_pad')
|
250 |
+
# adaface_subj_embs: [16, 768]
|
251 |
+
adaface_subj_embs = adaface_subj_embs.squeeze()
|
252 |
+
if update_text_encoder:
|
253 |
+
self.update_text_encoder_subj_embs(adaface_subj_embs)
|
254 |
+
return adaface_subj_embs
|
255 |
+
|
256 |
+
def encode_prompt(self, prompt, negative_prompt=None, device="cuda", verbose=False):
|
257 |
+
if negative_prompt is None:
|
258 |
+
negative_prompt = self.negative_prompt
|
259 |
+
|
260 |
+
prompt = self.update_prompt(prompt)
|
261 |
+
if verbose:
|
262 |
+
print(f"Prompt: {prompt}")
|
263 |
+
|
264 |
+
# For some unknown reason, the text_encoder is still on CPU after self.pipeline.to(self.device).
|
265 |
+
# So we manually move it to GPU here.
|
266 |
+
self.pipeline.text_encoder.to(device)
|
267 |
+
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
268 |
+
prompt_embeds_, negative_prompt_embeds_ = \
|
269 |
+
self.pipeline.encode_prompt(prompt, device=device, num_images_per_prompt=1,
|
270 |
+
do_classifier_free_guidance=True, negative_prompt=negative_prompt)
|
271 |
+
return prompt_embeds_, negative_prompt_embeds_
|
272 |
+
|
273 |
+
# ref_img_strength is used only in the img2img pipeline.
|
274 |
+
def forward(self, noise, prompt, negative_prompt=None, guidance_scale=4.0,
|
275 |
+
out_image_count=4, ref_img_strength=0.8, generator=None, verbose=False):
|
276 |
+
if negative_prompt is None:
|
277 |
+
negative_prompt = self.negative_prompt
|
278 |
+
# prompt_embeds_, negative_prompt_embeds_: [1, 77, 768]
|
279 |
+
prompt_embeds_, negative_prompt_embeds_ = self.encode_prompt(prompt, negative_prompt, device=self.device, verbose=verbose)
|
280 |
+
# Repeat the prompt embeddings for all images in the batch.
|
281 |
+
prompt_embeds_ = prompt_embeds_.repeat(out_image_count, 1, 1)
|
282 |
+
negative_prompt_embeds_ = negative_prompt_embeds_.repeat(out_image_count, 1, 1)
|
283 |
+
noise = noise.to(self.device).to(torch.float16)
|
284 |
+
|
285 |
+
# noise: [BS, 4, 64, 64]
|
286 |
+
# When the pipeline is text2img, strength is ignored.
|
287 |
+
images = self.pipeline(image=noise,
|
288 |
+
prompt_embeds=prompt_embeds_,
|
289 |
+
negative_prompt_embeds=negative_prompt_embeds_,
|
290 |
+
num_inference_steps=self.num_inference_steps,
|
291 |
+
guidance_scale=guidance_scale,
|
292 |
+
num_images_per_prompt=1,
|
293 |
+
strength=ref_img_strength,
|
294 |
+
generator=generator).images
|
295 |
+
# images: [BS, 3, 512, 512]
|
296 |
+
return images
|
297 |
+
|
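A hedged end-to-end sketch of AdaFaceWrapper (it mirrors app.py below). The model and checkpoint paths follow the repo layout; the reference photo paths are placeholders.

import torch

adaface = AdaFaceWrapper(pipeline_name="text2img",
                         base_model_path="models/sar/sar.safetensors",
                         adaface_ckpt_path="models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt",
                         device="cuda")
# Encode the subject's identity from reference photos into the 16 placeholder tokens.
subj_embs = adaface.generate_adaface_embeddings(image_paths=["ref1.jpg", "ref2.jpg"],  # placeholders
                                                out_id_embs_scale=1.0, update_text_encoder=True)
generator = torch.Generator(device="cuda").manual_seed(42)
noise = torch.randn(4, 3, 512, 512, device="cuda", generator=generator)  # as in app.py
images = adaface(noise, "woman walking on the beach, sunset, orange sky",
                 guidance_scale=4.0, out_image_count=4, generator=generator)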
app.py
ADDED
@@ -0,0 +1,203 @@
1 |
+
import sys
|
2 |
+
sys.path.append('./')
|
3 |
+
|
4 |
+
from adaface.adaface_wrapper import AdaFaceWrapper
|
5 |
+
import torch
|
6 |
+
from insightface.app import FaceAnalysis
|
7 |
+
from PIL import Image
|
8 |
+
import numpy as np
|
9 |
+
import random
|
10 |
+
|
11 |
+
import gradio as gr
|
12 |
+
import spaces
|
13 |
+
import argparse
|
14 |
+
parser = argparse.ArgumentParser()
|
15 |
+
parser.add_argument('--adaface_ckpt_path', type=str,
|
16 |
+
default='models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt')
|
17 |
+
parser.add_argument('--gpu', type=int, default=None)
|
18 |
+
parser.add_argument('--ip', type=str, default="0.0.0.0")
|
19 |
+
args = parser.parse_args()
|
20 |
+
|
21 |
+
# global variable
|
22 |
+
MAX_SEED = np.iinfo(np.int32).max
|
23 |
+
if torch.cuda.is_available():
|
24 |
+
device = "cuda" if args.gpu is None else f"cuda:{args.gpu}"
|
25 |
+
else:
|
26 |
+
device = "cpu"
|
27 |
+
dtype = torch.float16
|
28 |
+
|
29 |
+
# base_model_path is only used for initialization, not really used in the inference.
|
30 |
+
adaface = AdaFaceWrapper(pipeline_name="text2img", base_model_path="models/sar/sar.safetensors",
|
31 |
+
adaface_ckpt_path=args.adaface_ckpt_path, device=device)
|
32 |
+
|
33 |
+
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
|
34 |
+
if randomize_seed:
|
35 |
+
seed = random.randint(0, MAX_SEED)
|
36 |
+
return seed
|
37 |
+
|
38 |
+
def swap_to_gallery(images):
|
39 |
+
# Update uploaded_files_gallery, show files, hide clear_button_column
|
40 |
+
# Or:
|
41 |
+
# Update uploaded_init_img_gallery, show init_img_files, hide init_clear_button_column
|
42 |
+
return gr.update(value=images, visible=True), gr.update(visible=True), gr.update(value=images, visible=False)
|
43 |
+
|
44 |
+
def remove_back_to_files():
|
45 |
+
# Hide uploaded_files_gallery, show clear_button_column, hide files, reset init_img_selected_idx
|
46 |
+
# Or:
|
47 |
+
# Hide uploaded_init_img_gallery, hide init_clear_button_column, show init_img_files, reset init_img_selected_idx
|
48 |
+
return gr.update(visible=False), gr.update(visible=False), gr.update(value=None, visible=True)
|
49 |
+
|
50 |
+
def update_out_gallery(images):
|
51 |
+
#rows = (len(images) + 1) // 2 # Calculate the number of rows needed
|
52 |
+
return gr.update(height=600)
|
53 |
+
|
54 |
+
@spaces.GPU
|
55 |
+
def generate_image(image_paths, guidance_scale, adaface_id_cfg_scale,
|
56 |
+
num_images, prompt, negative_prompt, seed, progress=gr.Progress(track_tqdm=True)):
|
57 |
+
|
58 |
+
if image_paths is None or len(image_paths) == 0:
|
59 |
+
raise gr.Error(f"Cannot find any input face image! Please upload a face image.")
|
60 |
+
|
61 |
+
if prompt is None:
|
62 |
+
prompt = ""
|
63 |
+
|
64 |
+
adaface_subj_embs = \
|
65 |
+
adaface.generate_adaface_embeddings(image_folder=None, image_paths=image_paths,
|
66 |
+
out_id_embs_scale=adaface_id_cfg_scale, update_text_encoder=True)
|
67 |
+
|
68 |
+
if adaface_subj_embs is None:
|
69 |
+
raise gr.Error(f"Failed to detect any faces! Please try with other images")
|
70 |
+
|
71 |
+
generator = torch.Generator(device=device).manual_seed(seed)
|
72 |
+
print(f"Manual seed: {seed}")
|
73 |
+
# Generate num_images images for the user to select from.
|
74 |
+
noise = torch.randn(num_images, 3, 512, 512, device=device, generator=generator)
|
75 |
+
#print(noise.abs().sum())
|
76 |
+
# samples: A list of PIL Image instances.
|
77 |
+
samples = adaface(noise, prompt, negative_prompt, guidance_scale=guidance_scale, out_image_count=num_images, generator=generator, verbose=True)
|
78 |
+
return samples
|
79 |
+
|
80 |
+
### Description
|
81 |
+
title = r"""
|
82 |
+
<h1>AdaFace: A Versatile Face Encoder for Zero-Shot Diffusion Model Personalization</h1>
|
83 |
+
"""
|
84 |
+
|
85 |
+
description = r"""
|
86 |
+
<b>Official demo</b> for our NeurIPS 2024 submission <b>AdaFace: A Versatile Face Encoder for Zero-Shot Diffusion Model Personalization</b>.<br>
|
87 |
+
|
88 |
+
❗️**Tips**❗️
|
89 |
+
1. Upload one or more images of a person. If multiple faces are detected, we use the largest one.
|
90 |
+
2. Increase <b>AdaFace CFG Scale</b> (preferred) and/or <b>Guidance scale</b> to highlight fine facial features.
|
91 |
+
3. AdaFace Text-to-Video: <a href="https://huggingface.co/spaces/adaface-neurips/adaface-animate" style="display: inline-flex; align-items: center;">
|
92 |
+
AdaFace-Animate
|
93 |
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow" alt="Hugging Face Spaces" style="margin-left: 5px;">
|
94 |
+
</a>
|
95 |
+
|
96 |
+
**TODO**
|
97 |
+
- ControlNet integration.
|
98 |
+
"""
|
99 |
+
|
100 |
+
css = '''
|
101 |
+
.gradio-container {width: 85% !important}
|
102 |
+
'''
|
103 |
+
with gr.Blocks(css=css) as demo:
|
104 |
+
|
105 |
+
# description
|
106 |
+
gr.Markdown(title)
|
107 |
+
gr.Markdown(description)
|
108 |
+
|
109 |
+
with gr.Row():
|
110 |
+
with gr.Column():
|
111 |
+
|
112 |
+
# upload face image
|
113 |
+
# img_file = gr.Image(label="Upload a photo with a face", type="filepath")
|
114 |
+
img_files = gr.File(
|
115 |
+
label="Drag / Select 1 or more photos of a person's face",
|
116 |
+
file_types=["image"],
|
117 |
+
file_count="multiple"
|
118 |
+
)
|
119 |
+
uploaded_files_gallery = gr.Gallery(label="Subject images", visible=False, columns=3, rows=1, height=300)
|
120 |
+
with gr.Column(visible=False) as clear_button_column:
|
121 |
+
remove_and_reupload = gr.ClearButton(value="Remove and upload subject images", components=img_files, size="sm")
|
122 |
+
|
123 |
+
prompt = gr.Dropdown(label="Prompt",
|
124 |
+
info="Try something like 'man/woman walking on the beach'. If the face is not in focus, try adding 'face portrait of' at the beginning.",
|
125 |
+
value=None,
|
126 |
+
allow_custom_value=True,
|
127 |
+
filterable=False,
|
128 |
+
choices=[
|
129 |
+
"woman ((best quality)), ((masterpiece)), ((realistic)), long highlighted hair, futuristic silver armor suit, confident stance, high-resolution, living room, smiling, head tilted, perfect smooth skin",
|
130 |
+
"woman walking on the beach, sunset, orange sky",
|
131 |
+
"woman in a white apron and chef hat, garnishing a gourmet dish, full body view, long shot",
|
132 |
+
"woman dancing pose among folks in a park, waving hands",
|
133 |
+
"woman in iron man costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot",
|
134 |
+
"woman jedi wielding a lightsaber, star wars, full body view, eye level shot",
|
135 |
+
"woman playing guitar on a boat, ocean waves",
|
136 |
+
"woman with a passion for reading, curled up with a book in a cozy nook near a window",
|
137 |
+
"woman running pose in a park, eye level shot",
|
138 |
+
"woman in superman costume flying pose, the sky ablaze with hues of orange and purple, full body view, long shot"
|
139 |
+
])
|
140 |
+
|
141 |
+
submit = gr.Button("Submit", variant="primary")
|
142 |
+
|
143 |
+
negative_prompt = gr.Textbox(
|
144 |
+
label="Negative Prompt",
|
145 |
+
value="flaws in the eyes, flaws in the face, lowres, non-HDRi, low quality, worst quality, artifacts, noise, text, watermark, glitch, mutated, ugly, disfigured, hands, partially rendered objects, partially rendered eyes, deformed eyeballs, cross-eyed, blurry, mutation, duplicate, out of frame, cropped, mutilated, bad anatomy, deformed, bad proportions, nude, naked, nsfw, topless, bare breasts",
|
146 |
+
)
|
147 |
+
|
148 |
+
adaface_id_cfg_scale = gr.Slider(
|
149 |
+
label="AdaFace CFG Scale",
|
150 |
+
info="The CFG scale of the AdaFace ID embeddings (influencing fine facial features)",
|
151 |
+
minimum=0.5,
|
152 |
+
maximum=8.0,
|
153 |
+
step=0.5,
|
154 |
+
value=4.0,
|
155 |
+
)
|
156 |
+
|
157 |
+
guidance_scale = gr.Slider(
|
158 |
+
label="Guidance scale",
|
159 |
+
minimum=0.5,
|
160 |
+
maximum=8.0,
|
161 |
+
step=0.5,
|
162 |
+
value=4.0,
|
163 |
+
)
|
164 |
+
|
165 |
+
num_images = gr.Slider(
|
166 |
+
label="Number of output images",
|
167 |
+
minimum=1,
|
168 |
+
maximum=6,
|
169 |
+
step=1,
|
170 |
+
value=4,
|
171 |
+
)
|
172 |
+
seed = gr.Slider(
|
173 |
+
label="Seed",
|
174 |
+
minimum=0,
|
175 |
+
maximum=MAX_SEED,
|
176 |
+
step=1,
|
177 |
+
value=0,
|
178 |
+
)
|
179 |
+
randomize_seed = gr.Checkbox(label="Randomize seed", value=True, info="Uncheck for reproducible results")
|
180 |
+
|
181 |
+
with gr.Column():
|
182 |
+
out_gallery = gr.Gallery(label="Generated Images", columns=2, rows=2, height=600)
|
183 |
+
|
184 |
+
img_files.upload(fn=swap_to_gallery, inputs=img_files, outputs=[uploaded_files_gallery, clear_button_column, img_files])
|
185 |
+
remove_and_reupload.click(fn=remove_back_to_files, outputs=[uploaded_files_gallery, clear_button_column, img_files])
|
186 |
+
|
187 |
+
submit.click(
|
188 |
+
fn=randomize_seed_fn,
|
189 |
+
inputs=[seed, randomize_seed],
|
190 |
+
outputs=seed,
|
191 |
+
queue=False,
|
192 |
+
api_name=False,
|
193 |
+
).then(
|
194 |
+
fn=generate_image,
|
195 |
+
inputs=[img_files, guidance_scale, adaface_id_cfg_scale, num_images, prompt, negative_prompt, seed],
|
196 |
+
outputs=[out_gallery]
|
197 |
+
).then(
|
198 |
+
fn=update_out_gallery,
|
199 |
+
inputs=[out_gallery],
|
200 |
+
outputs=[out_gallery]
|
201 |
+
)
|
202 |
+
|
203 |
+
demo.launch(share=True, server_name=args.ip, ssl_verify=False)
|
arc2face_models.py
ADDED
@@ -0,0 +1,303 @@
import torch
import torch.nn as nn
from transformers import CLIPTextModel
from transformers.models.clip.modeling_clip import CLIPAttention
from typing import Any, Callable, Dict, Optional, Tuple, Union, List
from transformers.modeling_outputs import BaseModelOutputWithPooling
from transformers.modeling_attn_mask_utils import AttentionMaskConverter
# from transformers.models.clip.modeling_clip import _make_causal_mask, _expand_mask
_make_causal_mask = AttentionMaskConverter._make_causal_mask
_expand_mask = AttentionMaskConverter._expand_mask

from adaface.util import add_noise_to_tensor

# Extend CLIPAttention by using multiple k_proj and v_proj in each head.
# To avoid too much increase of computation, we don't extend q_proj.
class CLIPAttentionMKV(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config, multiplier=2):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.multiplier = multiplier

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim * self.multiplier)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim * self.multiplier)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    # The (approximately) repeated token features are repeated along the last dim in tensor
    # (multiplier * num_heads * head_dim), and then reshaped to (bsz, -1, num_heads, head_dim).
    # Therefore, the "multiplier" dim is tucked into the seq_len dim, which looks like
    # [token1_emb, token1_emb, token2_emb, token2_emb, ..., tokenN_emb, tokenN_emb].
    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def extend_weights(self, clip_attn_layer, layer_idx, multiplier, noise_std=0.1,
                       noise_std_is_relative=True, keep_norm=False, verbose=False):
        self.multiplier *= multiplier
        # q_proj and out_proj are the same as the original CLIPAttention.
        self.q_proj.weight.data = clip_attn_layer.q_proj.weight.data.clone()
        self.q_proj.bias.data = clip_attn_layer.q_proj.bias.data.clone()
        self.out_proj.weight.data = clip_attn_layer.out_proj.weight.data.clone()
        self.out_proj.bias.data = clip_attn_layer.out_proj.bias.data.clone()

        # bias doesn't need noise perturbation, as after the weights are noised,
        # different copies of the weight/bias will receive different gradients,
        # making the bias terms diverge and identifiable after training.
        self.v_proj.bias.data = clip_attn_layer.v_proj.bias.data.repeat(multiplier)
        self.k_proj.bias.data = clip_attn_layer.k_proj.bias.data.repeat(multiplier)

        self.v_proj.weight.data = clip_attn_layer.v_proj.weight.data.repeat(multiplier, 1)
        self.k_proj.weight.data = clip_attn_layer.k_proj.weight.data.repeat(multiplier, 1)

        if noise_std > 0:
            ORIG_V_SHAPE = list(clip_attn_layer.v_proj.weight.shape)
            ORIG_V_SHAPE_D0 = ORIG_V_SHAPE[0]
            # Adding noise to the extra copies of the weights (keep the first copy unchanged).
            self.v_proj.weight.data[ORIG_V_SHAPE_D0:] = \
                add_noise_to_tensor(self.v_proj.weight.data[ORIG_V_SHAPE_D0:],
                                    noise_std, noise_std_is_relative, keep_norm)
            if verbose:
                NEW_V_SHAPE = list(self.v_proj.weight.shape)
                NOISED_V_SHAPE = list(self.v_proj.weight.data[ORIG_V_SHAPE_D0:].shape)
                print(f"Layer {layer_idx}: {NOISED_V_SHAPE} in {NEW_V_SHAPE} of v_proj is added with {noise_std} noise")

            ORIG_K_SHAPE = list(clip_attn_layer.k_proj.weight.shape)
            ORIG_K_SHAPE_D0 = ORIG_K_SHAPE[0]
            # Adding noise to the extra copies of the weights.
            self.k_proj.weight.data[ORIG_K_SHAPE_D0:] = \
                add_noise_to_tensor(self.k_proj.weight.data[ORIG_K_SHAPE_D0:],
                                    noise_std, noise_std_is_relative, keep_norm)
            if verbose:
                NEW_K_SHAPE = list(self.k_proj.weight.shape)
                NOISED_K_SHAPE = list(self.k_proj.weight.data[ORIG_K_SHAPE_D0:].shape)
                print(f"Layer {layer_idx}: {NOISED_K_SHAPE} in {NEW_K_SHAPE} of k_proj is added with {noise_std} noise")

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        query_states = self.q_proj(hidden_states) * self.scale
        # For key_states and value_states, the multiplier is absorbed into the seq_len (dim 1, shape specified as -1).
        # [token0_head_emb, token0_head_emb, token1_head_emb, token1_head_emb, ..., tokenN-1_head_emb, tokenN-1_head_emb].
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        # src_len0 is the original src_len without the multiplier.
        src_len0 = src_len // self.multiplier
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len0):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len0)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            # The last dim of attn_weights corresponds to [token0, token0, token1, token1, ..., tokenN-1, tokenN-1].
            # If reshaping it as (self.multiplier, src_len0), it will become
            # [[token0, token0, token1, token1, ..., tokenN//2], [tokenN//2+1, tokenN//2+1, ..., tokenN-1, tokenN-1]],
            # and the mask will be applied to wrong elements.
            # If reshaping it as (src_len0, self.multiplier), it will become
            # [[token0, token1, ..., tokenN-1], [token0, token1, ..., tokenN-1]], and then
            # the mask at element i will mask all the multiplier elements at i, which is desired.
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len0, self.multiplier) + causal_attention_mask.unsqueeze(4)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len0):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len0)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len0, self.multiplier) + attention_mask.unsqueeze(4)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped

class CLIPTextModelWrapper(CLIPTextModel):
    # Adapted from https://github.com/huggingface/transformers/blob/v4.34.1/src/transformers/models/clip/modeling_clip.py#L812
    # Modified to accept precomputed token embeddings "input_token_embs" as input or calculate them from input_ids and return them.
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        input_token_embs: Optional[torch.Tensor] = None,
        hidden_state_layer_weights: Optional[torch.Tensor] = None,
        return_token_embs: Optional[bool] = False,
    ) -> Union[Tuple, torch.Tensor, BaseModelOutputWithPooling]:

        if return_token_embs:
            return self.text_model.embeddings.token_embedding(input_ids)

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output_attentions = output_attentions if output_attentions is not None else self.text_model.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.text_model.config.output_hidden_states
        )
        if hidden_state_layer_weights is not None:
            output_hidden_states = True
        return_dict = return_dict if return_dict is not None else self.text_model.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.text_model.embeddings(input_ids=input_ids, position_ids=position_ids, inputs_embeds=input_token_embs)

        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = _make_causal_mask(input_shape, hidden_states.dtype, device=hidden_states.device)
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.text_model.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            # output_hidden_states is False by default, and only True if hidden_state_layer_weights is provided.
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # If output_hidden_states is True, then encoder_outputs[0] is last_hidden_state [1, 22, 768].
        # encoder_outputs[1] is hidden_states, which is a tuple of 13 hidden states, each being [1, 22, 768].
        # encoder_outputs[0] == encoder_outputs[1][12].
        if hidden_state_layer_weights is None:
            last_hidden_state = encoder_outputs[0]
        else:
            num_hidden_state_layers = len(hidden_state_layer_weights)
            last_hidden_states = encoder_outputs[1][-num_hidden_state_layers:]
            hidden_state_layer_weights = hidden_state_layer_weights.to(last_hidden_states[0].dtype)
            # Normalize the weights to sum to 1 across layers.
            # hidden_state_layer_weights: [3, 1] or [3, 768].
            hidden_state_layer_weights = hidden_state_layer_weights / hidden_state_layer_weights.sum(dim=0, keepdim=True)
            # [3, 1/768] -> [3, 1, 1, 1/768]
            hidden_state_layer_weights = hidden_state_layer_weights.unsqueeze(1).unsqueeze(1)
            # A weighted sum of last_hidden_states.
            # [3, 1, 22, 768] * [3, 1, 1, 1/768] -> [3, 1, 22, 768] -> [1, 22, 768]
            last_hidden_state = (torch.stack(last_hidden_states, dim=0) * hidden_state_layer_weights).sum(dim=0)

        last_hidden_state = self.text_model.final_layer_norm(last_hidden_state)

        # self.text_model.eos_token_id == 2 is True.
        if self.text_model.eos_token_id == 2:
            # The `eos_token_id` was incorrect before PR #24773: Let's keep what has been done here.
            # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
            # ------------------------------------------------------------
            # text_embeds.shape = [batch_size, sequence_length, transformer.width]
            # take features from the eot embedding (eot_token is the highest number in each sequence)
            # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.text_model.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    # Applied to layers [begin_layer_idx, end_layer_idx) in the encoder.
    # The layer indexed by end_layer_idx is not included.
    # If both layer indices are -1, then apply to all layers (0-11).
    def extend_clip_attention_MKV_multiplier(self, begin_layer_idx=-1, end_layer_idx=-1, multiplier=2, noise_std=0.1):
        num_extended_layers = 0

        for layer_idx, layer in enumerate(self.text_model.encoder.layers):
            if begin_layer_idx >= 0 and layer_idx < begin_layer_idx:
                continue
            if end_layer_idx >= 0 and layer_idx >= end_layer_idx:
                break
            # This shouldn't happen, unless self_attn has already been extended as CLIPAttentionMKV.
            if not isinstance(layer.self_attn, (CLIPAttention, CLIPAttentionMKV)):
                breakpoint()
            old_attn_layer = layer.self_attn
            if not isinstance(old_attn_layer, CLIPAttentionMKV):
                layer.self_attn = CLIPAttentionMKV(old_attn_layer.config, 1)
            layer.self_attn.extend_weights(old_attn_layer, layer_idx, multiplier, noise_std, verbose=True)
            num_extended_layers += 1

        return num_extended_layers
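A minimal usage sketch for the wrapper above, assuming the same 'openai/clip-vit-large-patch14' checkpoint that subj_basis_generator.py loads; the multiplier, noise_std and token ids below are illustrative, not values taken from this repository:

import torch
from adaface.arc2face_models import CLIPTextModelWrapper

text_encoder = CLIPTextModelWrapper.from_pretrained("openai/clip-vit-large-patch14")
# Double the k/v projections of every encoder layer, adding mild noise to the extra copies.
num_extended = text_encoder.extend_clip_attention_MKV_multiplier(
    begin_layer_idx=-1, end_layer_idx=-1, multiplier=2, noise_std=0.1
)
print(f"Extended {num_extended} attention layers.")

# The wrapper can also return raw token embeddings instead of encoder hidden states.
token_ids = torch.tensor([[49406, 320, 1125, 49407]])  # BOS, "a", "photo", EOS (illustrative ids)
token_embs = text_encoder(input_ids=token_ids, return_token_embs=True)
print(token_embs.shape)  # torch.Size([1, 4, 768])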
models/adaface/subjects-celebrity2024-05-16T17-22-46_zero3-ada-30000.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4aa1eb9ff3e364ea1b9db6dfff0c281ff3b57864d7ccc4c64d5f29ed752484f3
size 821700521

models/arc2face/arc2face/config.json
ADDED
@@ -0,0 +1,67 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.22.0",
  "act_fn": "silu",
  "addition_embed_type": null,
  "addition_embed_type_num_heads": 64,
  "addition_time_embed_dim": null,
  "attention_head_dim": 8,
  "attention_type": "default",
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "class_embed_type": null,
  "class_embeddings_concat": false,
  "conv_in_kernel": 3,
  "conv_out_kernel": 3,
  "cross_attention_dim": 768,
  "cross_attention_norm": null,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "dropout": 0.0,
  "dual_cross_attention": false,
  "encoder_hid_dim": null,
  "encoder_hid_dim_type": null,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": 2,
  "mid_block_only_cross_attention": null,
  "mid_block_scale_factor": 1,
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "num_attention_heads": null,
  "num_class_embeds": null,
  "only_cross_attention": false,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": null,
  "resnet_out_scale_factor": 1.0,
  "resnet_skip_time_act": false,
  "resnet_time_scale_shift": "default",
  "reverse_transformer_layers_per_block": null,
  "sample_size": 64,
  "time_cond_proj_dim": null,
  "time_embedding_act_fn": null,
  "time_embedding_dim": null,
  "time_embedding_type": "positional",
  "timestep_post_act": null,
  "transformer_layers_per_block": 1,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ],
  "upcast_attention": false,
  "use_linear_projection": false
}
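This config describes an SD 1.5-sized UNet (cross_attention_dim 768, sample_size 64). A minimal sketch, assuming diffusers from requirements.txt, of loading the bundled Arc2Face UNet from its local folder; the float16 choice is illustrative:

import torch
from diffusers import UNet2DConditionModel

# Reads config.json and diffusion_pytorch_model.safetensors from the local folder below.
unet = UNet2DConditionModel.from_pretrained("models/arc2face/arc2face", torch_dtype=torch.float16)
print(unet.config.cross_attention_dim)  # 768, matching the CLIP text embedding width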
models/arc2face/arc2face/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d2377c16b7135650ca375817a4812a999194fba1f081e39117bd54e50dacc784
size 3438167536

models/arc2face/encoder/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "projection_dim": 768,
  "torch_dtype": "float32",
  "transformers_version": "4.34.1",
  "vocab_size": 49408
}

models/arc2face/encoder/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e2d364df774b7d3975f85de42bda73c0c0cdb952273dd5f138511b6cf65424aa
size 492308829

models/insightface/models/antelopev2/1k3d68.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
size 143607619

models/insightface/models/antelopev2/2d106det.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
size 5030888

models/insightface/models/antelopev2/arcface.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ec639a0429b4819130d1405a2d3b38beaa4cc4a6c5bd9cf48b94fdf65461de83
size 260694151

models/insightface/models/antelopev2/genderage.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
size 1322532

models/insightface/models/antelopev2/scrfd_10g_bnkps.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
size 16923827

models/insightface/models/buffalo_l/1k3d68.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
size 143607619

models/insightface/models/buffalo_l/2d106det.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
size 5030888

models/insightface/models/buffalo_l/det_10g.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
size 16923827

models/insightface/models/buffalo_l/genderage.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
size 1322532

models/insightface/models/buffalo_l/w600k_r50.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c06341c33c2ca1f86781dab0e829f88ad5b64be9fba56e56bc9ebdefc619e43
size 174383860

models/sar/sar.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35a5d7615850879ffecce7b1e463ae0317c95fe784dd9b179793b58531a9e3ab
size 2299982596

requirements.txt
ADDED
@@ -0,0 +1,12 @@
torch
torchvision
einops
gradio
transformers
insightface
opencv-python
diffusers
onnx>=1.16.0
onnxruntime
safetensors
spaces
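The insightface and onnxruntime entries above are what the bundled ONNX detectors and recognizers are run with. A minimal sketch, assuming the models/insightface layout shown above and a CPU-only onnxruntime provider; the detection size and dummy image are illustrative:

import numpy as np
from insightface.app import FaceAnalysis

# root is the parent of the "models/<pack_name>" directories shipped in this repo.
face_app = FaceAnalysis(name="buffalo_l", root="models/insightface",
                        providers=["CPUExecutionProvider"])
face_app.prepare(ctx_id=0, det_size=(512, 512))

image = np.zeros((640, 640, 3), dtype=np.uint8)  # placeholder BGR image
faces = face_app.get(image)
if faces:
    print(faces[0].normed_embedding.shape)  # (512,), the ArcFace identity embedding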
subj_basis_generator.py
ADDED
@@ -0,0 +1,758 @@
1 |
+
# Borrowed from ip-adapter resampler.py.
|
2 |
+
# https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/resampler.py
|
3 |
+
# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
|
4 |
+
# and https://github.com/lucidrains/imagen-pytorch/blob/main/imagen_pytorch/imagen_pytorch.py
|
5 |
+
|
6 |
+
import math
|
7 |
+
|
8 |
+
import torch
|
9 |
+
from torch import nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
from einops import rearrange
|
12 |
+
from einops.layers.torch import Rearrange
|
13 |
+
from transformers import CLIPVisionModel, CLIPTokenizer
|
14 |
+
|
15 |
+
import numpy as np
|
16 |
+
from torch import einsum
|
17 |
+
from dataclasses import dataclass
|
18 |
+
from typing import Optional, Tuple
|
19 |
+
from transformers.utils import ModelOutput
|
20 |
+
from adaface.util import arc2face_inverse_face_prompt_embs, gen_gradient_scaler
|
21 |
+
from adaface.arc2face_models import CLIPTextModelWrapper
|
22 |
+
import sys
|
23 |
+
sys.modules['ldm'] = sys.modules['adaface']
|
24 |
+
|
25 |
+
def reshape_tensor(x, num_heads):
|
26 |
+
bs, length, width = x.shape
|
27 |
+
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
|
28 |
+
x = x.view(bs, length, num_heads, -1)
|
29 |
+
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
|
30 |
+
x = x.transpose(1, 2)
|
31 |
+
# (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
|
32 |
+
x = x.reshape(bs, num_heads, length, -1)
|
33 |
+
return x
|
34 |
+
|
35 |
+
# FFN. Added a Dropout layer at the end, so that it can still load the old ckpt.
|
36 |
+
def FeedForward(dim, mult=4, p_dropout=0.1):
|
37 |
+
inner_dim = int(dim * mult)
|
38 |
+
return nn.Sequential(
|
39 |
+
nn.LayerNorm(dim),
|
40 |
+
nn.Linear(dim, inner_dim, bias=False),
|
41 |
+
nn.GELU(),
|
42 |
+
nn.Linear(inner_dim, dim, bias=False),
|
43 |
+
nn.Dropout(p_dropout),
|
44 |
+
)
|
45 |
+
|
46 |
+
# IP-Adapter FaceID class. Only used in knn-faces.py.
|
47 |
+
# From: https://github.com/tencent-ailab/IP-Adapter/blob/main/ip_adapter/ip_adapter_faceid_separate.py
|
48 |
+
class IP_MLPProjModel(nn.Module):
|
49 |
+
def __init__(self, cross_attention_dim=768, id_embeddings_dim=512, num_tokens=4):
|
50 |
+
super().__init__()
|
51 |
+
|
52 |
+
self.cross_attention_dim = cross_attention_dim
|
53 |
+
self.num_tokens = num_tokens
|
54 |
+
|
55 |
+
self.proj = nn.Sequential(
|
56 |
+
nn.Linear(id_embeddings_dim, id_embeddings_dim*2),
|
57 |
+
nn.GELU(),
|
58 |
+
nn.Linear(id_embeddings_dim*2, cross_attention_dim*num_tokens),
|
59 |
+
)
|
60 |
+
self.norm = nn.LayerNorm(cross_attention_dim)
|
61 |
+
|
62 |
+
def forward(self, id_embeds):
|
63 |
+
x = self.proj(id_embeds)
|
64 |
+
x = x.reshape(-1, self.num_tokens, self.cross_attention_dim)
|
65 |
+
x = self.norm(x)
|
66 |
+
return x
|
67 |
+
|
68 |
+
# group_dim: the tensor dimension that corresponds to the multiple groups.
|
69 |
+
class LearnedSoftAggregate(nn.Module):
|
70 |
+
def __init__(self, num_feat, group_dim, keepdim=False):
|
71 |
+
super(LearnedSoftAggregate, self).__init__()
|
72 |
+
self.group_dim = group_dim
|
73 |
+
# num_feat = 1: element-wise score function & softmax.
|
74 |
+
# num_feat > 1: the linear score function is applied to the last dim (features) of the input tensor.
|
75 |
+
self.num_feat = num_feat
|
76 |
+
self.feat2score = nn.Linear(num_feat, 1, bias=False)
|
77 |
+
self.keepdim = keepdim
|
78 |
+
|
79 |
+
def forward(self, x, score_basis=None):
|
80 |
+
# If there's only one mode, do nothing.
|
81 |
+
if x.shape[self.group_dim] == 1:
|
82 |
+
if self.keepdim:
|
83 |
+
return x
|
84 |
+
else:
|
85 |
+
return x.squeeze(self.group_dim)
|
86 |
+
|
87 |
+
# Assume the last dim of x is the feature dim.
|
88 |
+
if score_basis is None:
|
89 |
+
score_basis = x
|
90 |
+
|
91 |
+
if self.num_feat == 1:
|
92 |
+
mode_scores = self.feat2score(score_basis.unsqueeze(-1)).squeeze(-1)
|
93 |
+
else:
|
94 |
+
mode_scores = self.feat2score(score_basis)
|
95 |
+
attn_probs = mode_scores.softmax(dim=self.group_dim)
|
96 |
+
x_aggr = (x * attn_probs).sum(dim=self.group_dim, keepdim=self.keepdim)
|
97 |
+
return x_aggr
|
98 |
+
|
99 |
+
def LoRA_ExpandEmbs(input_dim, lora_rank, output_dim, num_modes,
|
100 |
+
num_output_vecs, elementwise_affine=True, p_dropout=0.1):
|
101 |
+
return nn.Sequential(
|
102 |
+
# Project to [BS, lora_rank * output_dim * num_modes].
|
103 |
+
# It takes a huge param size. 512 * 32 * 768 * 4 = 6,291,456.
|
104 |
+
nn.Linear(input_dim, lora_rank * output_dim * num_modes, bias=False),
|
105 |
+
# Reshape to [BS, lora_rank, output_dim].
|
106 |
+
Rearrange('b (m q d) -> b m q d', q=lora_rank, m=num_modes, d=output_dim),
|
107 |
+
nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
|
108 |
+
# Aggregate [BS, num_modes, loar_rank, output_dim] -> [BS, lora_rank, output_dim].
|
109 |
+
LearnedSoftAggregate(num_feat=output_dim, group_dim=1, keepdim=False) if num_modes > 1 \
|
110 |
+
else Rearrange('b () q d -> b q d'),
|
111 |
+
nn.Dropout(p_dropout),
|
112 |
+
# Permute to [BS, output_dim, lora_rank].
|
113 |
+
Rearrange('b q d -> b d q'),
|
114 |
+
# Project to [BS, output_dim, num_output_vecs].
|
115 |
+
nn.Linear(lora_rank, num_output_vecs, bias=False),
|
116 |
+
# Permute to [BS, num_output_vecs, output_dim].
|
117 |
+
Rearrange('b d q -> b q d'),
|
118 |
+
nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
|
119 |
+
nn.Dropout(p_dropout),
|
120 |
+
)
|
121 |
+
|
122 |
+
def ExpandEmbs(input_dim, output_dim, expansion_ratio, elementwise_affine=True, p_dropout=0.1):
|
123 |
+
return nn.Sequential(
|
124 |
+
# Project to [BS, num_output_vecs * output_dim].
|
125 |
+
nn.Linear(input_dim, expansion_ratio * output_dim, bias=False),
|
126 |
+
# Reshape to [BS, num_output_vecs, output_dim].
|
127 |
+
Rearrange('b (e d) -> b e d', e=expansion_ratio, d=output_dim),
|
128 |
+
nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
|
129 |
+
nn.Dropout(p_dropout),
|
130 |
+
)
|
131 |
+
|
132 |
+
# Input: [BS, N, D].
|
133 |
+
def MultimodeProjection(input_dim, output_dim=-1, num_modes=4, elementwise_affine=True, p_dropout=0.1):
|
134 |
+
if output_dim == -1:
|
135 |
+
output_dim = input_dim
|
136 |
+
|
137 |
+
return nn.Sequential(
|
138 |
+
nn.Linear(input_dim, output_dim * num_modes, bias=False),
|
139 |
+
# Reshape to [BS, num_output_vecs, output_dim].
|
140 |
+
Rearrange('b n (m d) -> b n m d', m=num_modes, d=output_dim),
|
141 |
+
nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
|
142 |
+
# If num_modes == 1, then simply remove the mode dim. Otherwise, aggregate the modes.
|
143 |
+
LearnedSoftAggregate(num_feat=output_dim, group_dim=2, keepdim=False) if num_modes > 1 \
|
144 |
+
else Rearrange('b n () d -> b n d'),
|
145 |
+
nn.Dropout(p_dropout),
|
146 |
+
)
|
147 |
+
|
148 |
+
# Low-rank to high-rank transformation.
|
149 |
+
def Lora2Hira(lora_rank, hira_rank, output_dim, num_modes, elementwise_affine=True, p_dropout=0.1):
|
150 |
+
return nn.Sequential(
|
151 |
+
# Permute to [BS, output_dim, lora_rank].
|
152 |
+
Rearrange('b q d -> b d q'),
|
153 |
+
# Project to [BS, output_dim, hira_rank].
|
154 |
+
nn.Linear(lora_rank, hira_rank * num_modes, bias=False),
|
155 |
+
# Reshape and permute to [BS, num_modes, num_output_vecs, output_dim].
|
156 |
+
Rearrange('b d (m q) -> b m q d', m=num_modes, q=hira_rank),
|
157 |
+
nn.LayerNorm(output_dim, elementwise_affine=elementwise_affine),
|
158 |
+
# Aggregate [BS, num_modes, hira_rank, output_dim] -> [BS, hira_rank, output_dim].
|
159 |
+
LearnedSoftAggregate(num_feat=output_dim, group_dim=1, keepdim=False) if num_modes > 1 \
|
160 |
+
else Rearrange('b () q d -> b q d'),
|
161 |
+
nn.Dropout(p_dropout),
|
162 |
+
)
|
163 |
+
|
164 |
+
class PerceiverAttention(nn.Module):
|
165 |
+
def __init__(self, *, dim, dim_head=64, num_heads=8, elementwise_affine=True):
|
166 |
+
super().__init__()
|
167 |
+
self.scale = dim_head**-0.5
|
168 |
+
self.dim_head = dim_head
|
169 |
+
self.num_heads = num_heads
|
170 |
+
inner_dim = dim_head * num_heads
|
171 |
+
|
172 |
+
self.norm1 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine)
|
173 |
+
self.norm2 = nn.LayerNorm(dim, elementwise_affine=elementwise_affine)
|
174 |
+
|
175 |
+
self.to_q = nn.Linear(dim, inner_dim, bias=False)
|
176 |
+
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
|
177 |
+
self.to_out = nn.Linear(inner_dim, dim, bias=False)
|
178 |
+
|
179 |
+
def forward(self, x, latent_queries):
|
180 |
+
"""
|
181 |
+
Args:
|
182 |
+
x (torch.Tensor): image features
|
183 |
+
shape (b, n1, D)
|
184 |
+
latent (torch.Tensor): latent features
|
185 |
+
shape (b, n2, D)
|
186 |
+
"""
|
187 |
+
x = self.norm1(x)
|
188 |
+
latent_queries = self.norm2(latent_queries)
|
189 |
+
|
190 |
+
b, l, _ = latent_queries.shape
|
191 |
+
|
192 |
+
q = self.to_q(latent_queries)
|
193 |
+
kv_input = torch.cat((x, latent_queries), dim=-2)
|
194 |
+
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
|
195 |
+
|
196 |
+
q = reshape_tensor(q, self.num_heads)
|
197 |
+
k = reshape_tensor(k, self.num_heads)
|
198 |
+
v = reshape_tensor(v, self.num_heads)
|
199 |
+
|
200 |
+
# attention
|
201 |
+
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
|
202 |
+
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
|
203 |
+
attn = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
|
204 |
+
out = attn @ v
|
205 |
+
|
206 |
+
out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
|
207 |
+
|
208 |
+
return self.to_out(out)
|
209 |
+
|
210 |
+
|
211 |
+
class CrossAttention(nn.Module):
|
212 |
+
# output_dim is always the same as input_dim.
|
213 |
+
# num_q only matters when q_aware_to_v is True.
|
214 |
+
# If q_aware_to_v is False, query x in forward() is still usable.
|
215 |
+
def __init__(self, input_dim, num_heads=6, p_dropout=0.05,
|
216 |
+
identity_to_q=False, identity_to_k=False, identity_to_v=False, v_has_skip=True,
|
217 |
+
q_aware_to_v=True, num_q=416, v_repeat=4, q_aware_to_v_lora_rank=64,
|
218 |
+
identity_to_out=False, out_has_skip=False):
|
219 |
+
super().__init__()
|
220 |
+
dim_head = input_dim // num_heads
|
221 |
+
inner_dim = dim_head * num_heads
|
222 |
+
|
223 |
+
self.num_heads = num_heads
|
224 |
+
self.q_aware_to_v = q_aware_to_v
|
225 |
+
self.v_has_skip = v_has_skip
|
226 |
+
self.to_q = nn.Sequential(
|
227 |
+
nn.Linear(input_dim, inner_dim, bias=False),
|
228 |
+
nn.LayerNorm(inner_dim, elementwise_affine=True)
|
229 |
+
) if not identity_to_q else nn.Identity()
|
230 |
+
self.to_k = nn.Sequential(
|
231 |
+
nn.Linear(input_dim, inner_dim, bias=False),
|
232 |
+
nn.LayerNorm(inner_dim, elementwise_affine=True)
|
233 |
+
) if not identity_to_k else nn.Identity()
|
234 |
+
|
235 |
+
self.v_repeat = v_repeat
|
236 |
+
self.num_q_group = num_q_group = num_q // v_repeat # 416 / 4 = 104.
|
237 |
+
|
238 |
+
# If q_aware_to_v is True, then self.to_v consists of num_q projections of input_dim to inner_dim.
|
239 |
+
# Otherwise, self.to_v consists of a single projection of input_dim to inner_dim.
|
240 |
+
if q_aware_to_v:
|
241 |
+
# all_q_mid: 104 * 64 = 6656.
|
242 |
+
all_q_mid = num_q_group * q_aware_to_v_lora_rank
|
243 |
+
self.to_v = nn.Sequential(
|
244 |
+
# number of params: 768 * 6656 = 5,111,808.
|
245 |
+
# Input: [BS, 16, 768]. Output: [BS, 16, 104*64] = [BS, 16, 6656].
|
246 |
+
# Each 768-dim vec is dispersed into 104 64-dim vecs.
|
247 |
+
nn.Linear(input_dim, all_q_mid, bias=False),
|
248 |
+
nn.LayerNorm(all_q_mid, elementwise_affine=True),
|
249 |
+
# Change the dim of the tensor to [BS, 6656, 16], as Conv1d transforms dim 1.
|
250 |
+
Rearrange('b n q -> b q n', q=all_q_mid),
|
251 |
+
# Each q_aware_to_v projection has its own linear layer.
|
252 |
+
# The total number of parameters will be 6656*768 = 5,111,808.
|
253 |
+
# Output: [BS, 104*768, 16]. Each 64 dim feature is expanded to 768 dim.
|
254 |
+
nn.Conv1d(
|
255 |
+
in_channels=all_q_mid,
|
256 |
+
out_channels=num_q_group * input_dim,
|
257 |
+
kernel_size=1,
|
258 |
+
groups=num_q_group,
|
259 |
+
bias=False,
|
260 |
+
),
|
261 |
+
# Output: [BS, 104, 16, 768].
|
262 |
+
Rearrange('b (q d) n -> b q n d', q=num_q_group, d=input_dim),
|
263 |
+
nn.LayerNorm(input_dim, elementwise_affine=True),
|
264 |
+
)
|
265 |
+
else:
|
266 |
+
self.to_v = nn.Sequential(
|
267 |
+
nn.Linear(input_dim, inner_dim, bias=False),
|
268 |
+
nn.LayerNorm(inner_dim, elementwise_affine=True)
|
269 |
+
) if not identity_to_v else nn.Identity()
|
270 |
+
|
271 |
+
if identity_to_out:
|
272 |
+
assert not out_has_skip, "identity_to_out=True, then out_has_skip has to be False."
|
273 |
+
|
274 |
+
if identity_to_out:
|
275 |
+
self.to_out = nn.Identity()
|
276 |
+
else:
|
277 |
+
self.to_out = nn.Sequential(
|
278 |
+
nn.Linear(input_dim, input_dim, bias=False),
|
279 |
+
nn.Dropout(p_dropout),
|
280 |
+
nn.LayerNorm(inner_dim, elementwise_affine=True)
|
281 |
+
)
|
282 |
+
|
283 |
+
self.out_has_skip = out_has_skip
|
284 |
+
self.attn_drop = nn.Dropout(p_dropout)
|
285 |
+
|
286 |
+
def forward(self, x, context=None, attn_mat=None, return_attn=False):
|
287 |
+
h = self.num_heads
|
288 |
+
|
289 |
+
if context is None:
|
290 |
+
context = x
|
291 |
+
|
292 |
+
if attn_mat is None:
|
293 |
+
# q: [BS, Q, D] -> [BS, Q, D].
|
294 |
+
q = self.to_q(x)
|
295 |
+
# k: [BS, L, D] -> [BS, L, D].
|
296 |
+
k = self.to_k(context)
|
297 |
+
# q: [6, 512, 128], k: [6, 17, 128].
|
298 |
+
q, k = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k))
|
299 |
+
|
300 |
+
if self.q_aware_to_v:
|
301 |
+
# context: [BS, L, D]. v: [BS, Q, L, D].
|
302 |
+
# There are effectively Q to_v projections.
|
303 |
+
v = self.to_v(context)
|
304 |
+
if self.v_has_skip:
|
305 |
+
v = v + context.unsqueeze(1)
|
306 |
+
else:
|
307 |
+
# v: [BS, L, D].
|
308 |
+
v = self.to_v(context)
|
309 |
+
if self.v_has_skip:
|
310 |
+
v = v + context
|
311 |
+
|
312 |
+
#print(v.shape)
|
313 |
+
|
314 |
+
if self.q_aware_to_v:
|
315 |
+
# v: [6, 64, 17, 128].
|
316 |
+
# v is query-specific, so there's an extra dim for the query.
|
317 |
+
v = rearrange(v, 'b q n (h d) -> (b h) q n d', h=h)
|
318 |
+
# Each v is for a query group with 512/64 = 8 queries.
|
319 |
+
# So each v is repeated 8 times to match the number of queries.
|
320 |
+
# v: [6, 64, 17, 128] -> [6, 512, 17, 128].
|
321 |
+
v = v.repeat(1, self.v_repeat, 1, 1)
|
322 |
+
else:
|
323 |
+
v = rearrange(v, 'b n (h d) -> (b h) n d', h=h)
|
324 |
+
|
325 |
+
if attn_mat is None:
|
326 |
+
scale = q.size(-1) ** -0.25
|
327 |
+
sim = einsum('b i d, b j d -> b i j', q * scale, k * scale)
|
328 |
+
# sim: [6, 64, 17]. 6: bs 1 * h 6.
|
329 |
+
# attention, what we cannot get enough of
|
330 |
+
# NOTE: the normalization is done across tokens, not across pixels.
|
331 |
+
# So for each pixel, the sum of attention scores across tokens is 1.
|
332 |
+
attn = sim.softmax(dim=-1)
|
333 |
+
attn = self.attn_drop(attn)
|
334 |
+
#print(attn.std())
|
335 |
+
else:
|
336 |
+
attn = attn_mat
|
337 |
+
|
338 |
+
if self.q_aware_to_v:
|
339 |
+
# attn: [6, 32, 17]. v: [6, 32, 17, 128]. 128: dim of each head. out: [6, 32, 128].
|
340 |
+
# out is combined with different attn weights and v for different queries.
|
341 |
+
out = einsum('b i j, b i j d -> b i d', attn, v)
|
342 |
+
else:
|
343 |
+
# v: [6, 17, 128]. out: [6, 32, 128].
|
344 |
+
out = einsum('b i j, b j d -> b i d', attn, v)
|
345 |
+
|
346 |
+
# [6, 32, 128] -> [1, 32, 768].
|
347 |
+
out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
|
348 |
+
|
349 |
+
if self.out_has_skip:
|
350 |
+
out = self.to_out(out) + out
|
351 |
+
else:
|
352 |
+
out = self.to_out(out)
|
353 |
+
|
354 |
+
if return_attn:
|
355 |
+
return out, attn
|
356 |
+
else:
|
357 |
+
return out
|
358 |
+
|
359 |
+
class SubjBasisGenerator(nn.Module):
|
360 |
+
def __init__(
|
361 |
+
self,
|
362 |
+
# number of cross-attention heads. Half of the number of heads 12 of OpenAI clip-vit-large-patch14:
|
363 |
+
# https://huggingface.co/openai/clip-vit-large-patch14/blob/main/config.json
|
364 |
+
num_heads=6,
|
365 |
+
num_id_vecs={ 'subj': 77, 'bg': 257 }, # number of identity vectors. 18: 16 face tokens + 2 extra tokens. 257: 257 CLIP tokens.
|
366 |
+
num_out_embs_per_layer=4, # num_out_embs. subj: 16. bg: 4.
|
367 |
+
num_out_layers=16, # number of layers of output embeddings.
|
368 |
+
image_embedding_dim=768, # CLIP image feature dimension, as per config.json above.
|
369 |
+
# DINO vits16 has 6 attention heads:
|
370 |
+
# https://huggingface.co/facebook/dino-vits16/blob/main/config.json
|
371 |
+
dino_embedding_dim=384, # DINO object feature dimension for objects.
|
372 |
+
output_dim=768, # CLIP text embedding input dimension.
|
373 |
+
placeholder_is_bg: bool = False, # Whether the placeholder is for the image background.
|
374 |
+
prompt2token_proj_grad_scale: float = 0.4, # Gradient scale for prompt2token_proj.
|
375 |
+
zs_extra_words_scale: float = 0.5, # Scale for extra words in the prompt2token_proj.
|
376 |
+
learnable_hidden_state_weights_scheme: str = 'per-layer', # none, per-layer.
|
377 |
+
bg_prompt_translator_has_to_out_proj: bool = False, # Whether the prompt_trans_layers have a to_out projection.
|
378 |
+
):
|
379 |
+
super().__init__()
|
380 |
+
|
381 |
+
self.placeholder_is_bg = placeholder_is_bg
|
382 |
+
self.num_out_layers = num_out_layers
|
383 |
+
self.num_out_embs_per_layer = num_out_embs_per_layer
|
384 |
+
# subj: 64, bg: 32.
|
385 |
+
self.num_out_embs = num_out_layers * num_out_embs_per_layer
|
386 |
+
self.output_dim = output_dim
|
387 |
+
# num_id_vecs should be the number of core ID embs, 16.
|
388 |
+
# However, in such case, pos_embs is not used. So it doesn't matter if it's wrongly set.
|
389 |
+
self.num_id_vecs = num_id_vecs['bg'] if placeholder_is_bg else num_id_vecs['subj']
|
390 |
+
self.pos_embs = nn.Parameter(torch.randn(1, self.num_id_vecs, output_dim))
|
391 |
+
self.pos_embs_ln = nn.LayerNorm(output_dim)
|
392 |
+
self.zs_extra_words_scale = zs_extra_words_scale
|
393 |
+
self.output_scale = output_dim ** -0.5
|
394 |
+
self.clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
395 |
+
|
396 |
+
if not self.placeholder_is_bg:
|
397 |
+
# [1, 384] -> [1, 16, 768].
|
398 |
+
# TODO: use CLIPTextModelWrapper as obj_proj_in.
|
399 |
+
self.obj_proj_in = ExpandEmbs(dino_embedding_dim, output_dim, expansion_ratio=self.num_id_vecs)
|
400 |
+
|
401 |
+
# self.prompt2token_proj: [1, 16, 768] -> [1, 77, 768] (with paddings).
|
402 |
+
# If self.placeholder_is_bg: prompt2token_proj is set to None.
|
403 |
+
self.prompt2token_proj = CLIPTextModelWrapper.from_pretrained('openai/clip-vit-large-patch14')
|
404 |
+
self.prompt2token_proj_grad_scale = prompt2token_proj_grad_scale
|
405 |
+
self.prompt2token_proj_grad_scaler = gen_gradient_scaler(prompt2token_proj_grad_scale)
|
406 |
+
print(f"Subj prompt2token_proj initialized with grad scale of {prompt2token_proj_grad_scale}.")
|
407 |
+
# Freeze prompt2token_proj if prompt2token_proj_grad_scale is 0.
|
408 |
+
# Set requires_grad to False for all parameters in prompt2token_proj, to save memory taken by the optimizer.
|
409 |
+
if prompt2token_proj_grad_scale == 0:
|
410 |
+
self.freeze_prompt2token_proj()
|
411 |
+
|
412 |
+
self.prompt2token_proj_attention_multiplier = -1
|
413 |
+
self.initialize_hidden_state_layer_weights(learnable_hidden_state_weights_scheme, 'cpu')
|
414 |
+
self.pad_embeddings = None
|
415 |
+
self.bg_proj_in = None
|
416 |
+
else:
|
417 |
+
# For background placeholders, face and object embeddings are not used as they are foreground.
|
418 |
+
self.obj_proj_in = None
|
419 |
+
self.prompt2token_proj = None
|
420 |
+
print("Bg prompt2token_proj is set to None.")
|
421 |
+
|
422 |
+
self.bg_proj_in = nn.Sequential(
|
423 |
+
nn.Linear(image_embedding_dim, output_dim, bias=False),
|
424 |
+
nn.LayerNorm(output_dim),
|
425 |
+
)
|
426 |
+
|
427 |
+
self.latent_queries = nn.Parameter(torch.randn(1, self.num_out_embs, output_dim))
|
428 |
+
self.latent_queries_ln = nn.LayerNorm(output_dim)
|
429 |
+
|
430 |
+
self.bg_prompt_translator_has_to_out_proj = bg_prompt_translator_has_to_out_proj
|
431 |
+
identity_to_v = False
|
432 |
+
v_has_skip = not identity_to_v # True
|
433 |
+
identity_to_out = not bg_prompt_translator_has_to_out_proj # True
|
434 |
+
out_has_skip = not identity_to_out # False
|
435 |
+
# prompt_translator has a to_v projection with skip connection, and doesn't have a to_out projection.
|
436 |
+
# dim=768, num_heads=6.
|
437 |
+
self.prompt_translator = \
|
438 |
+
CrossAttention(input_dim=output_dim, num_heads=num_heads, p_dropout=0.05,
|
439 |
+
identity_to_q=False, identity_to_k=False, identity_to_v=identity_to_v,
|
440 |
+
q_aware_to_v=False, v_has_skip=v_has_skip,
|
441 |
+
num_q=0, # When not q_aware_to_v, num_q is not referenced.
|
442 |
+
identity_to_out=identity_to_out,
|
443 |
+
out_has_skip=out_has_skip)
|
444 |
+
'''
|
445 |
+
prompt_translator: CLIPEncoder
|
446 |
+
# https://github.com/huggingface/transformers/blob/1872bde7fc6a5d6796bd742bc2dc38eaf8069c5d/src/transformers/models/clip/modeling_clip.py#L566
|
447 |
+
# CLIPEncoder.layers: 12 layers of CLIPEncoderLayer, each being
|
448 |
+
(0): CLIPEncoderLayer(
|
449 |
+
(self_attn): CLIPAttention(
|
450 |
+
(k_proj): Linear(in_features=768, out_features=768, bias=True)
|
451 |
+
(v_proj): Linear(in_features=768, out_features=768, bias=True)
|
452 |
+
(q_proj): Linear(in_features=768, out_features=768, bias=True)
|
453 |
+
(out_proj): Linear(in_features=768, out_features=768, bias=True)
|
454 |
+
)
|
455 |
+
(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
456 |
+
(mlp): CLIPMLP(
|
457 |
+
(activation_fn): QuickGELUActivation()
|
458 |
+
(fc1): Linear(in_features=768, out_features=3072, bias=True)
|
459 |
+
(fc2): Linear(in_features=3072, out_features=768, bias=True)
|
460 |
+
)
|
461 |
+
(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
|
462 |
+
)
|
463 |
+
'''
|
464 |
+
|
465 |
+
print(repr(self))
|
466 |
+
|
467 |
+
# raw_id_embs: ArcFace embeddings for faces (not used since we have arc2face_id_embs),
|
468 |
+
# or DINO embeddings for objects.
|
469 |
+
# arc2face_id_embs: [BS, 16, 768], the core identity embeddings generated by Arc2Face.
|
470 |
+
def forward(self, arc2face_id_embs, clip_features=None, raw_id_embs=None, out_id_embs_scale=1.0,
|
471 |
+
is_face=True, is_training=False, adaface_prompt_embs_inf_type='full_half_pad'):
|
472 |
+
|
473 |
+
if not self.placeholder_is_bg:
|
474 |
+
BS = arc2face_id_embs.shape[0]
|
475 |
+
else:
|
476 |
+
# If bg, then arc2face_id_embs is set to None, but clip_features is not None.
|
477 |
+
BS = clip_features.shape[0]
|
478 |
+
|
479 |
+
adaface_prompt_embs = None
|
480 |
+
if not hasattr(self, 'clip_tokenizer'):
|
481 |
+
self.clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
|
482 |
+
|
483 |
+
# No need to use raw_id_embs if placeholder_is_bg.
|
484 |
+
if not self.placeholder_is_bg:
|
485 |
+
if is_face:
|
486 |
+
assert arc2face_id_embs is not None
|
487 |
+
# arc2face_embs has been projected to the (modified) prompt embedding space
|
488 |
+
# by arc2face_forward_face_embs. This prompt embedding space is modified because Arc2Face finetuned
|
489 |
+
# the text encoder and the U-Net.
|
490 |
+
# in embedding_manager: [BS, 16, 768] -> [BS, 77, 768].
|
491 |
+
# arc2face_id_embs is part of arc2face_embs: [BS, 77, 768] -> [BS, 16, 768].
|
492 |
+
# adaface_prompt_embs is projected to the prompt embedding spaces. This is the
|
493 |
+
# original U-Net prompt embedding space.
|
494 |
+
|
495 |
+
# hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
|
496 |
+
hidden_state_layer_weights = self.hidden_state_layer_weights_grad_scaler(self.hidden_state_layer_weights)
|
497 |
+
# return_emb_types: a list of strings, each string is among
|
498 |
+
# ['full', 'core', 'full_pad', 'full_half_pad', 'full_zeroed_extra', 'b_core_e'].
|
499 |
+
# Using b_core_e is more computationally efficient than using full_zeroed_extra.
|
500 |
+
# But there is an unknow BUG that causes crash when using b_core_e.
|
501 |
+
if is_training:
|
502 |
+
return_emb_types = ['full_pad', 'core']
|
503 |
+
else:
|
504 |
+
# adaface_prompt_embs_inf_type: default is full_half_pad, same as training.
|
505 |
+
return_emb_types = [adaface_prompt_embs_inf_type, 'core']
|
506 |
+
|
507 |
+
if self.pad_embeddings is None:
|
508 |
+
self.generate_pad_embeddings()
|
509 |
+
else:
|
510 |
+
self.pad_embeddings = self.pad_embeddings.to(arc2face_id_embs.device)
|
511 |
+
|
512 |
+
with torch.set_grad_enabled(self.training and self.prompt2token_proj_grad_scale != 0):
|
513 |
+
# If list_extra_words is not None, then core_id_embs: [BS, 18, 768], three leading words, the 16 identity tokens
|
514 |
+
# and (at most) two extra words in full_prompt_embs, without BOS and EOS.
|
515 |
+
# If list_extra_words is None, then core_id_embs: [BS, 16, 768], the 16 identity tokens in full_prompt_embs.
|
516 |
+
# hidden_state_layer_weights: [[0.9163], [0.9483], [2.0762]]
|
517 |
+
# zs_extra_words_scale is only effective when list_extra_words is not None.
|
518 |
+
# adaface_prompt_embs: [BS, 77, 768], core_id_embs: [BS, 16, 768].
|
519 |
+
adaface_prompt_embs, core_id_embs = \
|
520 |
+
arc2face_inverse_face_prompt_embs(self.clip_tokenizer,
|
521 |
+
self.prompt2token_proj,
|
522 |
+
arc2face_id_embs,
|
523 |
+
list_extra_words=None,
|
524 |
+
return_emb_types=return_emb_types,
|
525 |
+
pad_embeddings=self.pad_embeddings,
|
526 |
+
hidden_state_layer_weights=hidden_state_layer_weights,
|
527 |
+
input_max_length=77, zs_extra_words_scale=self.zs_extra_words_scale)
|
528 |
+
# Reduce the update rate to prompt2token_proj.
|
529 |
+
adaface_prompt_embs = self.prompt2token_proj_grad_scaler(adaface_prompt_embs)
|
530 |
+
core_id_embs = self.prompt2token_proj_grad_scaler(core_id_embs)
|
531 |
+
elif raw_id_embs is not None:
|
532 |
+
# id_embs: [BS, 384] -> [BS, 18, 768].
|
533 |
+
# obj_proj_in is expected to project the DINO object features to
|
534 |
+
# the token embedding space. So no need to use prompt2token_proj.
|
535 |
+
id_embs = self.obj_proj_in(raw_id_embs)
|
536 |
+
else:
|
537 |
+
breakpoint()
|
538 |
+
else:
|
539 |
+
# Otherwise, context is the ad-hoc CLIP image features.
|
540 |
+
# id_embs: [BS, 257, 768].
|
541 |
+
id_embs = self.bg_proj_in(clip_features)
|
542 |
+
|
543 |
+
if self.placeholder_is_bg:
|
544 |
+
id_embs = id_embs + self.pos_embs_ln(self.pos_embs)
|
545 |
+
latent_queries = self.latent_queries_ln(self.latent_queries).repeat(BS, 1, 1)
|
546 |
+
# If bg, we don't have to use a specific attn layer for each 4-vec set. Instead, one attn layer can generate 257 embs,
|
547 |
+
# and we take the first 16*4=64.
|
548 |
+
# Output of prompt_translator is exactly num_out_embs == 64 tokens. id_embs_out: [BS, 64, 768].
|
549 |
+
# prompt_translator: better named as bg_prompt_translator. It maps the bg features
|
550 |
+
# to bg prompt embeddings.
|
551 |
+
with torch.set_grad_enabled(self.training):
|
552 |
+
id_embs_out = self.prompt_translator(latent_queries, id_embs)
|
553 |
+
# [BS, 64, 768] -> [BS, 16, 4, 768]
|
554 |
+
id_embs_out = id_embs_out.reshape(BS, self.num_out_layers, -1, self.output_dim)
|
555 |
+
adaface_subj_embs = id_embs_out * self.output_scale # * 0.036
|
556 |
+
else:
|
557 |
+
# adaface_subj_embs: [BS, 16, 768] -> [BS, 1, 16, 768] -> [BS, 16, 16, 768]
|
558 |
+
adaface_subj_embs = core_id_embs.unsqueeze(1).repeat(1, self.num_out_layers, 1, 1)
|
559 |
+
|
560 |
+
# If out_id_embs_scale < 1, adaface_subj_embs is a mix of adaface_subj_embs and pad_embeddings.
|
561 |
+
if out_id_embs_scale != 1:
|
562 |
+
# pad_embeddings: [77, 768] -> [16, 768] -> [1, 1, 16, 768].
|
563 |
+
pad_embeddings = self.pad_embeddings[4:4+self.num_out_embs_per_layer].unsqueeze(0).unsqueeze(0)
|
564 |
+
adaface_subj_embs = adaface_subj_embs * out_id_embs_scale \
|
565 |
+
+ pad_embeddings * (1 - out_id_embs_scale)
|
566 |
+
|
567 |
+
return adaface_subj_embs, adaface_prompt_embs
|
568 |
+
|
569 |
+
def initialize_hidden_state_layer_weights(self, learnable_hidden_state_weights_scheme, device):
|
570 |
+
if learnable_hidden_state_weights_scheme == 'none':
|
571 |
+
self.hidden_state_layer_weights = None
|
572 |
+
# A grad scaler with alpha =1 is nn.Identity(), which outputs None given None as input.
|
573 |
+
self.hidden_state_layer_weights_grad_scaler = gen_gradient_scaler(1)
|
574 |
+
print("hidden_state_layer_weights is set to None.")
|
575 |
+
|
576 |
+
elif learnable_hidden_state_weights_scheme == 'per-layer':
|
577 |
+
# Learnable weights of the last 3 layers, initialized to putting more focus on the last layer.
|
578 |
+
# 'per-layer': Different weights for different layers, but the same for different channels.
|
579 |
+
# hidden_state_layer_weights: [3, 1].
|
580 |
+
self.hidden_state_layer_weights = nn.Parameter(torch.tensor([[1.0], [2.0], [4.0]], device=device),
|
581 |
+
requires_grad=True)
|
582 |
+
self.hidden_state_layer_weights_grad_scaler = gen_gradient_scaler(5)
|
583 |
+
print("hidden_state_layer_weights initialized as per-layer [1, 2, 4], with grad scaler 5.")
|
584 |
+
else:
|
585 |
+
breakpoint()
|
586 |
+
|
587 |
+
def generate_pad_embeddings(self):
|
588 |
+
# clip_embeddings: CLIPTextEmbeddings instance. pad_embeddings is generated after
|
589 |
+
# prompt2token_proj is loaded from the finetuned weight. It seems such pad embeddings perform
|
590 |
+
# slightly better than the original pad embeddings.
|
591 |
+
clip_embeddings = self.prompt2token_proj.text_model.embeddings
|
592 |
+
# clip_embeddings() and clip_embeddings.token_embedding() differ in that
|
593 |
+
# clip_embeddings() adds positional embeddings, while clip_embeddings.token_embedding() doesn't.
|
594 |
+
# Adding positional embeddings seems to help somewhat.
|
595 |
+
# pad_tokens: pad_token_id 49407 repeated 77 times.
|
596 |
+
# pad_token_id is the EOS token. But BOS is 49406.
|
597 |
+
pad_tokens = torch.tensor([self.clip_tokenizer.pad_token_id]).to(clip_embeddings.token_embedding.weight.device).repeat(77)
|
598 |
+
# pad_embeddings: [77, 768].
|
599 |
+
pad_embeddings = clip_embeddings(pad_tokens)[0]
|
600 |
+
# We don't allow face recon to influence the pad embeddings.
|
601 |
+
# Otherwise, face identity will leak into the pad embeddings.
|
602 |
+
self.pad_embeddings = pad_embeddings.detach()
|
603 |
+
|
    def extend_prompt2token_proj_attention(self, begin_layer_idx=-1, end_layer_idx=-1, multiplier=2, noise_std=0.1):
        if multiplier > 1:
            num_extended_layers = self.prompt2token_proj.extend_clip_attention_MKV_multiplier(begin_layer_idx, end_layer_idx, multiplier, noise_std)
            self.prompt2token_proj_attention_multiplier = multiplier
            print(f"{num_extended_layers} layers in prompt2token_proj_attention are x{multiplier}")

    def freeze_prompt2token_proj(self):
        # If bg, then prompt2token_proj is set to None, so there's no need to freeze it.
        # Then we don't have to check whether it's for subj or bg.
        if self.prompt2token_proj is not None:
            frozen_param_names = []
            for param_name, param in self.prompt2token_proj.named_parameters():
                if param.requires_grad:
                    param.requires_grad = False
                    frozen_param_names.append(param_name)
                # If param is already frozen, then there's no need to freeze it again.
            print(f"{len(frozen_param_names)} params in Subj prompt2token_proj are frozen.")
            #print(f"Frozen parameters:\n{frozen_param_names}")

    def __repr__(self):
        type_sig = 'subj' if not self.placeholder_is_bg else 'bg'
        # Fix compatibility with the previous version.
        if not hasattr(self, 'bg_prompt_translator_has_to_out_proj'):
            self.bg_prompt_translator_has_to_out_proj = False
        if not hasattr(self, 'num_out_embs'):
            self.num_out_embs = -1
        return f"{type_sig} SubjBasisGenerator: num_out_embs={self.num_out_embs}, " \
               f"bg_prompt_translator_has_to_out_proj={self.bg_prompt_translator_has_to_out_proj}"

@dataclass
class BaseModelOutputWithPooling2(ModelOutput):
    """
    Base class for model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Last layer hidden-state of the first token of the sequence (classification token) after further processing
            through the layers used for the auxiliary pretraining task. E.g. for the BERT family of models, this returns
            the classification token after processing through a linear layer and a tanh activation function. The linear
            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    attn_mask: Optional[torch.FloatTensor] = None
# Revised from CLIPVisionTransformer to support an attention mask.
# self: a CLIPVisionTransformer instance.
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py#L821
# pixel_values: preprocessed B*C*H*W images. [BS, 3, 224, 224]
# attn_mask: B*H*W attention mask.
def CLIPVisionTransformer_forward(self, pixel_values = None, attn_mask=None,
                                  output_attentions = None,
                                  output_hidden_states = None, return_dict = None):

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if pixel_values is None:
        raise ValueError("You have to specify pixel_values")

    # Visual tokens are flattened in embeddings().
    # self.embeddings: CLIPVisionEmbeddings.
    # hidden_states: [BS, 257, 1280]. 257: 16*16 (patch_embeds) + 1 (class_embeds).
    # 16*16 is output from Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False).
    hidden_states = self.embeddings(pixel_values)
    hidden_states = self.pre_layrnorm(hidden_states)

    if attn_mask is not None:
        # feat_edge_size: 16.
        feat_edge_size = np.sqrt(hidden_states.shape[1] - 1).astype(int)
        # attn_mask: [BS, 512, 512] -> [BS, 1, 16, 16].
        attn_mask = F.interpolate(attn_mask.unsqueeze(1), size=(feat_edge_size, feat_edge_size), mode='nearest')
        # Flatten the mask: [BS, 1, 16, 16] => [BS, 1, 256].
        attn_mask = attn_mask.flatten(2)
        # Prepend 1 to the mask: [BS, 1, 256] => [BS, 1, 257].
        # This 1 corresponds to class_embeds, which is always attended to.
        attn_mask = torch.cat([torch.ones_like(attn_mask[:, :, :1]), attn_mask], dim=-1)
        attn_mask_pairs = torch.matmul(attn_mask.transpose(-1, -2), attn_mask).unsqueeze(1)
    else:
        attn_mask_pairs = None

    # encoder: CLIPEncoder.
    encoder_outputs = self.encoder(
        inputs_embeds=hidden_states,
        # New feature: (***The official documentation is wrong***)
        # attention_mask (`torch.Tensor` of shape `(batch_size, 1, sequence_length, sequence_length)`, *optional*):
        # Mask to avoid performing attention on pairs of tokens. Mask values selected in `[0, 1]`:
        # - 1 for pairs that are **not masked**,
        # - 0 for pairs that are **masked**.
        # attention_mask is eventually used by CLIPEncoderLayer:
        # https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py#L370
        attention_mask=attn_mask_pairs,
        output_attentions=output_attentions,        # False
        output_hidden_states=output_hidden_states,  # True
        return_dict=return_dict,                    # True
    )

    # last_hidden_state: [BS, 257, 1280]
    last_hidden_state = encoder_outputs[0]
    pooled_output = last_hidden_state[:, 0, :]
    pooled_output = self.post_layernorm(pooled_output)

    # return_dict is True.
    if not return_dict:
        return (last_hidden_state, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPooling2(
        last_hidden_state=last_hidden_state,
        pooler_output=pooled_output,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        # Newly added: return the resized, flattened attention mask.
        # [BS, 1, 257] -> [BS, 257, 1]
        attn_mask=attn_mask.permute(0, 2, 1) if attn_mask is not None else None
    )
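The pairwise mask built above is simply the outer product of the flattened token mask with itself, so a pair (i, j) is attended only when both token i and token j fall inside the mask. A tiny illustration with made-up numbers:

# Illustrative only: outer product of a per-token mask gives a per-pair mask.
import torch
m = torch.tensor([[1., 1., 0.]]).unsqueeze(1)          # [BS=1, 1, 3] token mask
pairs = torch.matmul(m.transpose(-1, -2), m)            # [1, 3, 3]
# pairs[0] == [[1, 1, 0],
#              [1, 1, 0],
#              [0, 0, 0]]  -> token 2 neither attends nor is attended to.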
class CLIPVisionModelWithMask(CLIPVisionModel):
    def __init__(self, config):
        super().__init__(config)
        # Replace vision_model.forward() with the new one that supports a mask.
        self.vision_model.forward = CLIPVisionTransformer_forward.__get__(self.vision_model)

    def forward(self, pixel_values = None, attn_mask = None, output_attentions = None,
                output_hidden_states = None, return_dict = None):

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            attn_mask=attn_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
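A minimal usage sketch for the masked vision encoder. The checkpoint name is an assumption for illustration (any CLIP vision checkpoint with 224x224 inputs should behave similarly); the mask uses 1 = attend, 0 = ignore:

# Hedged sketch only; checkpoint and shapes are illustrative assumptions.
import torch

model = CLIPVisionModelWithMask.from_pretrained("openai/clip-vit-large-patch14")
pixel_values = torch.randn(1, 3, 224, 224)
attn_mask = torch.ones(1, 512, 512)            # per-pixel mask, downsampled internally to 16x16
out = model(pixel_values=pixel_values, attn_mask=attn_mask, output_hidden_states=True)
# out.last_hidden_state: [1, 257, hidden_size]; out.attn_mask: [1, 257, 1]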
util.py
ADDED
@@ -0,0 +1,342 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from PIL import Image
import cv2

# add_noise_to_tensor() adds gaussian noise of a given (absolute or relative) std to the tensor.
def add_noise_to_tensor(ts, noise_std, noise_std_is_relative=True, keep_norm=False,
                        std_dim=-1, norm_dim=-1):
    if noise_std_is_relative:
        ts_std_mean = ts.std(dim=std_dim).mean().detach()
        noise_std *= ts_std_mean

    noise = torch.randn_like(ts) * noise_std
    if keep_norm:
        orig_norm = ts.norm(dim=norm_dim, keepdim=True)
        ts = ts + noise
        new_norm = ts.norm(dim=norm_dim, keepdim=True).detach()
        ts = ts * orig_norm / (new_norm + 1e-8)
    else:
        ts = ts + noise

    return ts
+
# Revised from RevGrad, by removing the grad negation.
|
28 |
+
class ScaleGrad(torch.autograd.Function):
|
29 |
+
@staticmethod
|
30 |
+
def forward(ctx, input_, alpha_, debug=False):
|
31 |
+
ctx.save_for_backward(alpha_, debug)
|
32 |
+
output = input_
|
33 |
+
if debug:
|
34 |
+
print(f"input: {input_.abs().mean().item()}")
|
35 |
+
return output
|
36 |
+
|
37 |
+
@staticmethod
|
38 |
+
def backward(ctx, grad_output): # pragma: no cover
|
39 |
+
# saved_tensors returns a tuple of tensors.
|
40 |
+
alpha_, debug = ctx.saved_tensors
|
41 |
+
if ctx.needs_input_grad[0]:
|
42 |
+
grad_output2 = grad_output * alpha_
|
43 |
+
if debug:
|
44 |
+
print(f"grad_output2: {grad_output2.abs().mean().item()}")
|
45 |
+
else:
|
46 |
+
grad_output2 = None
|
47 |
+
return grad_output2, None, None
|
48 |
+
|
49 |
+
class GradientScaler(nn.Module):
|
50 |
+
def __init__(self, alpha=1., debug=False, *args, **kwargs):
|
51 |
+
"""
|
52 |
+
A gradient scaling layer.
|
53 |
+
This layer has no parameters, and simply scales the gradient in the backward pass.
|
54 |
+
"""
|
55 |
+
super().__init__(*args, **kwargs)
|
56 |
+
|
57 |
+
self._alpha = torch.tensor(alpha, requires_grad=False)
|
58 |
+
self._debug = torch.tensor(debug, requires_grad=False)
|
59 |
+
|
60 |
+
def forward(self, input_):
|
61 |
+
_debug = self._debug if hasattr(self, '_debug') else False
|
62 |
+
return ScaleGrad.apply(input_, self._alpha.to(input_.device), _debug)
|
63 |
+
|
64 |
+
def gen_gradient_scaler(alpha, debug=False):
|
65 |
+
if alpha == 1:
|
66 |
+
return nn.Identity()
|
67 |
+
if alpha > 0:
|
68 |
+
return GradientScaler(alpha, debug=debug)
|
69 |
+
else:
|
70 |
+
assert alpha == 0
|
71 |
+
# Don't use lambda function here, otherwise the object can't be pickled.
|
72 |
+
return torch.detach
|
73 |
+
|
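A minimal usage sketch (it assumes this file is importable as util): the forward pass is an identity mapping; only the gradients are scaled in the backward pass.

import torch
from util import gen_gradient_scaler

x = torch.randn(4, 8, requires_grad=True)
scaler = gen_gradient_scaler(0.1)      # a GradientScaler instance
y = scaler(x)                          # y equals x in the forward pass
y.sum().backward()
print(x.grad.mean().item())            # ~0.1 instead of 1.0, i.e. gradients scaled by 0.1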
#@torch.autocast(device_type="cuda")
# In AdaFaceWrapper, input_max_length is 22.
def arc2face_forward_face_embs(tokenizer, arc2face_text_encoder, face_embs,
                               input_max_length=77, return_full_and_core_embs=True):

    '''
    arc2face_text_encoder: arc2face_models.py CLIPTextModelWrapper instance.
    face_embs: (N, 512) normalized ArcFace embeddings.
    return_full_and_core_embs: Return both the full prompt embeddings and the core embeddings.
                               If False, return only the core embeddings.
    '''

    # arcface_token_id: 1014
    arcface_token_id = tokenizer.encode("id", add_special_tokens=False)[0]

    # This step should be quite fast, and there's no need to cache the input_ids.
    input_ids = tokenizer(
            "photo of a id person",
            truncation=True,
            padding="max_length",
            max_length=input_max_length,  #tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids.to(face_embs.device)
    # input_ids: [1, 77] or [3, 77] (during training).
    input_ids = input_ids.repeat(len(face_embs), 1)
    face_embs_dtype = face_embs.dtype
    face_embs = face_embs.to(arc2face_text_encoder.dtype)
    # face_embs_padded: [1, 512] -> [1, 768].
    face_embs_padded = F.pad(face_embs, (0, arc2face_text_encoder.config.hidden_size - face_embs.shape[-1]), "constant", 0)
    # arc2face_text_encoder(input_ids=input_ids, ...) is called twice. The first is only to get the token embeddings (the shallowest mapping).
    # The second call does the ordinary CLIP text encoding pass.
    token_embs = arc2face_text_encoder(input_ids=input_ids, return_token_embs=True)
    token_embs[input_ids == arcface_token_id] = face_embs_padded

    prompt_embeds = arc2face_text_encoder(
        input_ids=input_ids,
        input_token_embs=token_embs,
        return_token_embs=False
    )[0]

    # Restore the original dtype of prompt_embeds: float16 -> float32.
    prompt_embeds = prompt_embeds.to(face_embs_dtype)

    if return_full_and_core_embs:
        # token 4: 'id' in "photo of a id person".
        # 4:20 are the most important 16 embeddings that contain the subject's identity.
        # [N, 77, 768] -> [N, 16, 768]
        return prompt_embeds, prompt_embeds[:, 4:20]
    else:
        # [N, 16, 768]
        return prompt_embeds[:, 4:20]
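A hedged usage sketch: it assumes the Arc2Face CLIP tokenizer and the CLIPTextModelWrapper text encoder have already been loaded as done elsewhere in this repo; the random embeddings stand in for real ArcFace embeddings.

# tokenizer and arc2face_text_encoder are assumed to be initialized elsewhere.
import torch
import torch.nn.functional as F

face_embs = F.normalize(torch.randn(2, 512), p=2, dim=-1)
full_embs, core_embs = arc2face_forward_face_embs(
    tokenizer, arc2face_text_encoder, face_embs,
    input_max_length=22, return_full_and_core_embs=True)
# full_embs: [2, 22, 768]; core_embs: [2, 16, 768] (tokens 4:20 carry the identity).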
def get_b_core_e_embeddings(prompt_embeds, length=22):
    b_core_e_embs = torch.cat([ prompt_embeds[:, :length], prompt_embeds[:, [-1]] ], dim=1)
    return b_core_e_embs

# return_emb_types: a list of strings, each being one of
# ['full', 'full_half_pad', 'full_pad', 'core', 'full_zeroed_extra', 'b_core_e'].
def arc2face_inverse_face_prompt_embs(clip_tokenizer, inverse_text_encoder, face_prompt_embs, list_extra_words,
                                      return_emb_types, pad_embeddings, hidden_state_layer_weights=None,
                                      input_max_length=77, zs_extra_words_scale=0.5):

    '''
    inverse_text_encoder: arc2face_models.py CLIPTextModelWrapper instance with **custom weights**.
    inverse_text_encoder is NOT the original arc2face text encoder, but is retrained to do the inverse mapping.
    face_prompt_embs: (BS, 16, 768). Only the core embeddings, no paddings.
    list_extra_words: [s_1, ..., s_BS], each s_i is a string of at most 2 extra words to be added to the prompt.
    return_emb_types: which embedding variants to return; one tensor is returned per requested type.
    '''

    if list_extra_words is not None:
        if len(list_extra_words) != len(face_prompt_embs):
            if len(face_prompt_embs) > 1:
                print("Warning: list_extra_words has a different length than face_prompt_embs.")
                if len(list_extra_words) == 1:
                    list_extra_words = list_extra_words * len(face_prompt_embs)
                else:
                    breakpoint()
            else:
                # len(face_prompt_embs) == 1, which occurs when same_subject_in_batch == True, e.g. in do_mix_prompt_distillation.
                # But list_extra_words always corresponds to the actual batch size. So we only take the first element.
                list_extra_words = list_extra_words[:1]

        for extra_words in list_extra_words:
            assert len(extra_words.split()) <= 2, "Each extra_words string should consist of at most 2 words."
        # 16 ", " are placeholders for face_prompt_embs.
        prompt_templates = [ "photo of a " + ", " * 16 + list_extra_words[i] for i in range(len(list_extra_words)) ]
    else:
        # 16 ", " are placeholders for face_prompt_embs.
        # No extra words are added to the prompt.
        prompt_templates = [ "photo of a " + ", " * 16 for _ in range(len(face_prompt_embs)) ]

    # This step should be quite fast, and there's no need to cache the input_ids.
    # input_ids: [BS, 77].
    input_ids = clip_tokenizer(
            prompt_templates,
            truncation=True,
            padding="max_length",
            max_length=input_max_length,
            return_tensors="pt",
        ).input_ids.to(face_prompt_embs.device)

    face_prompt_embs_dtype = face_prompt_embs.dtype
    face_prompt_embs = face_prompt_embs.to(inverse_text_encoder.dtype)

    # token_embs: [1, 77, 768]. This call is only to get the template token embeddings (the shallowest mapping).
    token_embs = inverse_text_encoder(input_ids=input_ids, return_token_embs=True)
    # token 4: first ", " in the template prompt.
    # Replace the embeddings of the 16 placeholder ", " tokens with face_prompt_embs.
    token_embs[:, 4:20] = face_prompt_embs

    # This call does the ordinary CLIP text encoding pass.
    prompt_embeds = inverse_text_encoder(
        input_ids=input_ids,
        input_token_embs=token_embs,
        hidden_state_layer_weights=hidden_state_layer_weights,
        return_token_embs=False
    )[0]

    # Restore the original dtype of prompt_embeds: float16 -> float32.
    prompt_embeds = prompt_embeds.to(face_prompt_embs_dtype)
    # token 4: first ", " in the template prompt.
    # 4:20 are the most important 16 embeddings that contain the subject's identity.
    # 20:22 are embeddings of the (at most) two extra words.
    # [N, 77, 768] -> [N, 16, 768]
    core_prompt_embs = prompt_embeds[:, 4:20]
    if list_extra_words is not None:
        # [N, 16, 768] -> [N, 18, 768]
        extra_words_embs = prompt_embeds[:, 20:22] * zs_extra_words_scale
        core_prompt_embs = torch.cat([core_prompt_embs, extra_words_embs], dim=1)

    return_prompts = []
    for emb_type in return_emb_types:
        if emb_type == 'full':
            return_prompts.append(prompt_embeds)
        elif emb_type == 'full_half_pad':
            prompt_embeds2 = prompt_embeds.clone()
            PADS = prompt_embeds2.shape[1] - 23
            if PADS >= 2:
                # Fill half of the remaining embeddings with pad embeddings.
                prompt_embeds2[:, 22:22+PADS//2] = pad_embeddings[22:22+PADS//2]
            return_prompts.append(prompt_embeds2)
        elif emb_type == 'full_pad':
            prompt_embeds2 = prompt_embeds.clone()
            # Fill the 22nd to the second-to-last embeddings with pad embeddings.
            prompt_embeds2[:, 22:-1] = pad_embeddings[22:-1]
            return_prompts.append(prompt_embeds2)
        elif emb_type == 'core':
            return_prompts.append(core_prompt_embs)
        elif emb_type == 'full_zeroed_extra':
            prompt_embeds2 = prompt_embeds.clone()
            # Only add two pad embeddings. The remaining embeddings are set to 0.
            # Make the positional embeddings align with the actual positions.
            prompt_embeds2[:, 22:24] = pad_embeddings[22:24]
            prompt_embeds2[:, 24:-1] = 0
            return_prompts.append(prompt_embeds2)
        elif emb_type == 'b_core_e':
            # The first 22 embeddings, plus the last EOS embedding.
            b_core_e_embs = get_b_core_e_embeddings(prompt_embeds, length=22)
            return_prompts.append(b_core_e_embs)
        else:
            breakpoint()

    return return_prompts
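A hedged sketch of the calling convention, with all model objects assumed to be initialized elsewhere in this repo: one tensor is returned per entry of return_emb_types, in the same order.

# clip_tokenizer, inverse_text_encoder, face_prompt_embs ([BS, 16, 768]) and
# pad_embeddings ([77, 768]) are assumed to be prepared elsewhere.
full_pad_embs, core_embs = arc2face_inverse_face_prompt_embs(
    clip_tokenizer, inverse_text_encoder, face_prompt_embs,
    list_extra_words=None,
    return_emb_types=['full_pad', 'core'],
    pad_embeddings=pad_embeddings)
# full_pad_embs: [BS, 77, 768] with positions 22:-1 filled by pad embeddings;
# core_embs:     [BS, 16, 768].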
# If pre_face_embs is None, generate random face embeddings [BS, 512].
# image_folder is passed only for logging purposes. image_paths contains the paths of the images.
def get_arc2face_id_prompt_embs(face_app, clip_tokenizer, arc2face_text_encoder,
                                extract_faceid_embeds, pre_face_embs,
                                image_folder, image_paths, images_np,
                                id_batch_size, device,
                                input_max_length=77, noise_level=0.0,
                                return_core_id_embs=False,
                                gen_neg_prompt=False, verbose=False):
    face_image_count = 0

    if extract_faceid_embeds:
        faceid_embeds = []
        if image_paths is not None:
            images_np = []
            for image_path in image_paths:
                image_np = np.array(Image.open(image_path))
                images_np.append(image_np)

        for i, image_np in enumerate(images_np):
            image_obj = Image.fromarray(image_np).resize((512, 512), Image.NEAREST)
            # Remove the alpha channel if it exists.
            if image_obj.mode == 'RGBA':
                image_obj = image_obj.convert('RGB')
            # This seems NOT a bug. The input image should be in BGR format, as per
            # https://github.com/deepinsight/insightface/issues/524
            image_np = cv2.cvtColor(np.array(image_obj), cv2.COLOR_RGB2BGR)
            # NOTE: the next line overwrites the BGR conversion above, so the RGB array is what is actually passed to face_app.
            image_np = np.array(image_obj)

            face_infos = face_app.get(image_np)
            if verbose and image_paths is not None:
                print(image_paths[i], len(face_infos))
            # Assume all images belong to the same subject. Therefore, we can skip the images with no face detected.
            if len(face_infos) == 0:
                continue
            # Only use the largest face.
            face_info = sorted(face_infos, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]
            # Each faceid_embed: [1, 512]
            faceid_embeds.append(torch.from_numpy(face_info.normed_embedding).unsqueeze(0))
            face_image_count += 1

        if verbose:
            if image_folder is not None:
                print(f"Extracted ID embeddings from {face_image_count} images in {image_folder}")
            else:
                print(f"Extracted ID embeddings from {face_image_count} images")

        if len(faceid_embeds) == 0:
            print("No face detected. Use a random face instead.")
            faceid_embeds = torch.randn(id_batch_size, 512).to(device=device, dtype=torch.float16)
        else:
            # faceid_embeds: [10, 512]
            faceid_embeds = torch.cat(faceid_embeds, dim=0)
            # faceid_embeds: [10, 512] -> [1, 512].
            # The ID embeddings are averaged, so the resulting prompt embeddings are shared across the batch.
            faceid_embeds = faceid_embeds.mean(dim=0, keepdim=True).to(device=device, dtype=torch.float16)
    else:
        # Random face embeddings. faceid_embeds: [BS, 512].
        if pre_face_embs is None:
            faceid_embeds = torch.randn(id_batch_size, 512)
        else:
            faceid_embeds = pre_face_embs
            if pre_face_embs.shape[0] == 1:
                faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)

        faceid_embeds = faceid_embeds.to(device=device, dtype=torch.float16)

    if noise_level > 0:
        # If id_batch_size > 1, the id_batch_size embeddings will differ after adding noise.
        faceid_embeds = add_noise_to_tensor(faceid_embeds, noise_level, noise_std_is_relative=True, keep_norm=True)

    faceid_embeds = F.normalize(faceid_embeds, p=2, dim=-1)

    # arc2face_pos_prompt_emb, arc2face_neg_prompt_emb: [BS, 77, 768]
    with torch.no_grad():
        arc2face_pos_prompt_emb, arc2face_pos_core_prompt_emb = \
            arc2face_forward_face_embs(clip_tokenizer, arc2face_text_encoder,
                                       faceid_embeds, input_max_length=input_max_length,
                                       return_full_and_core_embs=True)
        if return_core_id_embs:
            arc2face_pos_prompt_emb = arc2face_pos_core_prompt_emb
    # If extract_faceid_embeds, we assume all images are from the same subject, and the batch dim of faceid_embeds is 1.
    # So we need to repeat faceid_embeds.
    if extract_faceid_embeds:
        faceid_embeds = faceid_embeds.repeat(id_batch_size, 1)
        arc2face_pos_prompt_emb = arc2face_pos_prompt_emb.repeat(id_batch_size, 1, 1)

    if gen_neg_prompt:
        with torch.no_grad():
            arc2face_neg_prompt_emb, arc2face_neg_core_prompt_emb = \
                arc2face_forward_face_embs(clip_tokenizer, arc2face_text_encoder,
                                           torch.zeros_like(faceid_embeds),
                                           input_max_length=input_max_length,
                                           return_full_and_core_embs=True)
            if return_core_id_embs:
                arc2face_neg_prompt_emb = arc2face_neg_core_prompt_emb

        #if extract_faceid_embeds:
        #    arc2face_neg_prompt_emb = arc2face_neg_prompt_emb.repeat(id_batch_size, 1, 1)
        return face_image_count, faceid_embeds, arc2face_pos_prompt_emb, arc2face_neg_prompt_emb
    else:
        return face_image_count, faceid_embeds, arc2face_pos_prompt_emb
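A hedged end-to-end sketch: face_app (an insightface FaceAnalysis instance), clip_tokenizer and arc2face_text_encoder are assumed to be initialized elsewhere in this repo, and the image folder path is purely illustrative.

import glob

image_paths = glob.glob("subject_images/*.jpg")      # hypothetical folder
face_image_count, faceid_embeds, pos_prompt_emb = get_arc2face_id_prompt_embs(
    face_app, clip_tokenizer, arc2face_text_encoder,
    extract_faceid_embeds=True, pre_face_embs=None,
    image_folder="subject_images", image_paths=image_paths, images_np=None,
    id_batch_size=4, device="cuda",
    input_max_length=22, noise_level=0.0,
    return_core_id_embs=True, gen_neg_prompt=False, verbose=True)
# pos_prompt_emb: [4, 16, 768] core identity embeddings, shared by the 4 samples.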