m3face committed
Commit 332190f · 1 Parent(s): 92a5ecd

Adding files

README.md ADDED
@@ -0,0 +1,73 @@
---
tags:
- text-to-image
- controlnet
---

# M<sup>3</sup>Face Model Card
We introduce M<sup>3</sup>Face, a unified multi-modal, multilingual framework for controllable face generation and editing. The framework lets users work from text alone: it automatically generates the controlling modalities (a semantic segmentation mask or facial landmarks) from the text and then generates the face image.

## Getting Started

### Installation
1. Clone our repository:

```bash
git clone https://huggingface.co/m3face/m3face
cd m3face
```

2. Install dependencies:

```bash
pip install -r requirements.txt
```

### Resources
- Face generation requires 10 GB+ of VRAM for 512x512 images.
- Face editing requires 14 GB+ of VRAM for 512x512 images.

### Pre-trained Models
The ControlNet checkpoints are available at [`m3face/ControlnetModels`](https://huggingface.co/m3face/ControlnetModels), and the mask/landmark generator checkpoints at [`m3face/FaceConditioning`](https://huggingface.co/m3face/FaceConditioning).

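For reference, here is a minimal sketch of loading the English landmark ControlNet with `diffusers`. The revision name and base model mirror `generate.py`; the prompt and the bundled `data/landmarks/1.jpg` condition image are only illustrative.

```python
# Minimal sketch, not the full generate.py pipeline. Assumes a CUDA GPU and the
# "landmark-english" revision that generate.py uses for the English checkpoints.
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, UniPCMultistepScheduler
from diffusers.utils import load_image

controlnet = ControlNetModel.from_pretrained("m3face/ControlnetModels", revision="landmark-english")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
).to("cuda")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

condition = load_image("data/landmarks/1.jpg").resize((512, 512))  # landmark image shipped with this repo
image = pipe(
    "This attractive woman has narrow eyes, rosy cheeks, and wears heavy makeup.",
    image=condition,
    num_inference_steps=30,
).images[0]
image.save("face.png")
```
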
### M<sup>3</sup>CelebA Dataset
The M<sup>3</sup>CelebA dataset is available at [`m3face/M3CelebA`](https://huggingface.co/m3face/M3CelebA), where you can browse or download it.

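If the dataset repository is compatible with the `datasets` library loader (an assumption not verified here), it can also be pulled programmatically:

```python
from datasets import load_dataset

# Hypothetical usage: assumes m3face/M3CelebA exposes a configuration that the
# datasets library can load directly; check the dataset card for the exact splits.
dataset = load_dataset("m3face/M3CelebA", split="train")
print(dataset)
```
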
## Face Generation
You can generate faces from text, a segmentation mask, facial landmarks, or any combination of them by running:
```bash
python generate.py --seed 1111 \
    --condition "landmark" \
    --prompt "This attractive woman has narrow eyes, rosy cheeks, and wears heavy makeup." \
    --save_condition
```
The `--condition` argument selects the conditioning modality. By default, the conditioning image is generated by our framework and is saved when `--save_condition` is given. Alternatively, you can supply your own condition image with the `--condition_path` argument, as in the example below.

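For example, to condition on one of the landmark images bundled with this repo instead of generating one (an illustrative invocation; the seed and prompt are arbitrary):

```bash
python generate.py --seed 1111 \
    --condition "landmark" \
    --condition_path "data/landmarks/1.jpg" \
    --prompt "This attractive woman has narrow eyes, rosy cheeks, and wears heavy makeup."
```
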
## Face Editing
For face editing, run:
```bash
python edit.py --enable_xformers_memory_efficient_attention \
    --seed 1111 \
    --condition "landmark" \
    --prompt "She is smiling." \
    --image_path "/path/to/image" \
    --condition_path "/path/to/condition" \
    --edit_condition \
    --embedding_optimize_it 500 \
    --model_finetune_it 1000 \
    --alpha 0.7 1 1.1 \
    --num_inference_steps 30 \
    --unet_layer "2and3"
```
You need to specify the input image and its original conditioning modality. The face can then be edited either with a target conditioning modality you provide (`--edit_condition_path`) or by letting our framework edit the original conditioning modality (`--edit_condition`).
The `--unet_layer` argument selects which Stable Diffusion UNet up-blocks to fine-tune (`only1`, `only2`, `only3`, `1and2`, `2and3`, or `all`).

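The `--alpha` values control the trade-off between staying faithful to the input image and applying the text edit. After `edit.py` optimizes a text embedding to reconstruct the input image (`--embedding_optimize_it` steps) and fine-tunes the selected UNet layers (`--model_finetune_it` steps), it sweeps every `--alpha` and `--num_inference_steps` combination and linearly interpolates between the original prompt embedding and the optimized one. A small self-contained sketch of that interpolation (tensor shapes are placeholders; the blending formula follows `edit.py`):

```python
import torch

def blend_prompt_embeddings(orig_emb: torch.Tensor, optimized_emb: torch.Tensor, alpha: float) -> torch.Tensor:
    # Interpolation from the final stage of edit.py: alpha weighs the original
    # prompt embedding against the embedding optimized for the input image.
    return alpha * orig_emb + (1 - alpha) * optimized_emb

# Placeholder tensors just to make the sketch runnable; in edit.py these come
# from the frozen text encoder and the embedding-optimization stage.
orig_emb = torch.randn(1, 77, 768)
optimized_emb = torch.randn(1, 77, 768)
for alpha in (0.7, 1.0, 1.1):
    new_emb = blend_prompt_embeddings(orig_emb, optimized_emb, alpha)  # passed to the pipeline as prompt_embeds
```
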
> Note: If you don't have the original conditioning modality, you can generate it with the `plot_mask.py` and `plot_landmark.py` scripts (`plot_mask.py` also relies on a DML-CSR checkpoint; see its `--checkpoint_path` argument):
```bash
pip install git+https://github.com/mapillary/inplace_abn
python utils/plot_mask.py --image_path "/path/to/image"
python utils/plot_landmark.py --image_path "/path/to/image"
```

## Training
The code and instructions for training our models will be posted soon!
data/landmarks.pickle ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e129223b20f017a389b04ffe65ac0fd047f03a2bd9ef5bcb9eb0358b2b50a85
3
+ size 688
data/landmarks/1.jpg ADDED
data/landmarks/2.jpg ADDED
data/landmarks/3.jpg ADDED
data/landmarks/4.jpg ADDED
data/masks/1.png ADDED
data/masks/2.png ADDED
data/masks/3.png ADDED
data/masks/4.png ADDED
docs/pull-figure.png ADDED
edit.py ADDED
@@ -0,0 +1,493 @@
1
+ import os
2
+ import argparse
3
+ from tqdm.auto import tqdm
4
+ from packaging import version
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import torch.utils.checkpoint
9
+ from torchvision import transforms
10
+ from diffusers import (
11
+ AutoencoderKL,
12
+ ControlNetModel,
13
+ DDPMScheduler,
14
+ StableDiffusionControlNetPipeline,
15
+ UNet2DConditionModel,
16
+ UniPCMultistepScheduler,
17
+ PNDMScheduler,
18
+ AmusedInpaintPipeline, AmusedScheduler, VQModel, UVit2DModel
19
+
20
+ )
21
+ from diffusers.utils.import_utils import is_xformers_available
22
+ from diffusers.utils import load_image
23
+ from transformers import AutoTokenizer, CLIPFeatureExtractor, PretrainedConfig
24
+ from PIL import Image
25
+ from utils.mclip import *
26
+
27
+
28
+ def parse_args():
29
+ parser = argparse.ArgumentParser(description="Edit images with M3Face.")
30
+ parser.add_argument(
31
+ "--prompt",
32
+ type=str,
33
+ default="This attractive woman has narrow eyes, rosy cheeks, and wears heavy makeup.",
34
+ help="The input text prompt for image generation."
35
+ )
36
+ parser.add_argument(
37
+ "--condition",
38
+ type=str,
39
+ default="mask",
40
+ choices=["mask", "landmark"],
41
+ help="Use segmentation mask or facial landmarks for image generation."
42
+ )
43
+ parser.add_argument(
44
+ "--image_path",
45
+ type=str,
46
+ default=None,
47
+ help="Path to the input image."
48
+ )
49
+ parser.add_argument(
50
+ "--condition_path",
51
+ type=str,
52
+ default=None,
53
+ help="Path to the original mask/landmark image."
54
+ )
55
+ parser.add_argument(
56
+ "--edit_condition_path",
57
+ type=str,
58
+ default=None,
59
+ help="Path to the target mask/landmark image."
60
+ )
61
+ parser.add_argument(
62
+ "--output_dir",
63
+ type=str,
64
+ default='output/',
65
+ help="The output directory where the results will be written.",
66
+ )
67
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible generation.")
68
+ parser.add_argument(
69
+ "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
70
+ )
71
+ parser.add_argument("--edit_condition", action="store_true")
72
+ parser.add_argument("--load_unet_from_local", action="store_true")
73
+ parser.add_argument("--save_unet", action="store_true")
74
+ parser.add_argument("--unet_local_path", type=str, default=None)
75
+ parser.add_argument("--load_finetune_from_local", action="store_true")
76
+ parser.add_argument("--finetune_path", type=str, default=None)
77
+ parser.add_argument("--use_english", action="store_true", help="Use the English models.")
78
+ parser.add_argument("--embedding_optimize_it", type=int, default=500)
79
+ parser.add_argument("--model_finetune_it", type=int, default=1000)
80
+ parser.add_argument("--alpha", nargs="+", type=float, default=[0.8, 0.9, 1, 1.1])
81
+ parser.add_argument("--num_inference_steps", nargs="+", type=int, default=[20, 40, 50])
82
+ parser.add_argument("--unet_layer", type=str, default="2and3",
83
+ help="Which UNet layers in the SD to finetune.")
84
+
85
+ args = parser.parse_args()
86
+
87
+ return args
88
+
89
+ def get_muse(args):
90
+ muse_model_name = 'm3face/FaceConditioning'
91
+ if args.condition == 'mask':
92
+ muse_revision = 'segmentation'
93
+ elif args.condition == 'landmark':
94
+ muse_revision = 'landmark'
95
+ scheduler = AmusedScheduler.from_pretrained(muse_model_name, revision=muse_revision, subfolder='scheduler')
96
+ vqvae = VQModel.from_pretrained(muse_model_name, revision=muse_revision, subfolder='vqvae')
97
+ uvit2 = UVit2DModel.from_pretrained(muse_model_name, revision=muse_revision, subfolder='transformer')
98
+ text_encoder = MultilingualCLIP.from_pretrained(muse_model_name, revision=muse_revision, subfolder='text_encoder')
99
+ tokenizer = AutoTokenizer.from_pretrained(muse_model_name, revision=muse_revision, subfolder='tokenizer')
100
+
101
+ pipeline = AmusedInpaintPipeline(
102
+ vqvae=vqvae,
103
+ tokenizer=tokenizer,
104
+ text_encoder=text_encoder,
105
+ transformer=uvit2,
106
+ scheduler=scheduler
107
+ ).to("cuda")
108
+
109
+ return pipeline
110
+
111
+ def import_model_class_from_model_name(sd_model_name):
112
+ text_encoder_config = PretrainedConfig.from_pretrained(
113
+ sd_model_name,
114
+ subfolder="text_encoder",
115
+ )
116
+ model_class = text_encoder_config.architectures[0]
117
+
118
+ if model_class == "CLIPTextModel":
119
+ from transformers import CLIPTextModel
120
+
121
+ return CLIPTextModel
122
+ elif model_class == "RobertaSeriesModelWithTransformation":
123
+ from diffusers.pipelines.deprecated.alt_diffusion import RobertaSeriesModelWithTransformation
124
+
125
+ return RobertaSeriesModelWithTransformation
126
+ else:
127
+ raise ValueError(f"{model_class} is not supported.")
128
+
129
+ def preprocess(image, condition, prompt, tokenizer):
130
+ image_transforms = transforms.Compose(
131
+ [
132
+ transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR),
133
+ transforms.CenterCrop(512),
134
+ transforms.ToTensor(),
135
+ transforms.Normalize([0.5], [0.5]),
136
+ ]
137
+ )
138
+ condition_transforms = transforms.Compose(
139
+ [
140
+ transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR),
141
+ transforms.CenterCrop(512),
142
+ transforms.ToTensor(),
143
+ ]
144
+ )
145
+ image = image_transforms(image)
146
+ condition = condition_transforms(condition)
147
+ inputs = tokenizer(
148
+ [prompt], max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
149
+ )
150
+
151
+ return image, condition, inputs.input_ids, inputs.attention_mask
152
+
153
+ def main(args):
154
+ if args.use_english:
155
+ sd_model_name = 'runwayml/stable-diffusion-v1-5'
156
+ controlnet_model_name = 'm3face/ControlnetModels'
157
+ if args.condition == 'mask':
158
+ controlnet_revision = 'segmentation-english'
159
+ elif args.condition == 'landmark':
160
+ controlnet_revision = 'landmark-english'
161
+ else:
162
+ sd_model_name = 'BAAI/AltDiffusion-m18'
163
+ controlnet_model_name = 'm3face/ControlnetModels'
164
+ if args.condition == 'mask':
165
+ controlnet_revision = 'segmentation-mlin'
166
+ elif args.condition == 'landmark':
167
+ controlnet_revision = 'landmark-mlin'
168
+
169
+ # ========== set up models ==========
170
+ vae = AutoencoderKL.from_pretrained(sd_model_name, subfolder="vae")
171
+ tokenizer = AutoTokenizer.from_pretrained(sd_model_name, subfolder="tokenizer", use_fast=False)
172
+ text_encoder_cls = import_model_class_from_model_name(sd_model_name)
173
+ text_encoder = text_encoder_cls.from_pretrained(sd_model_name, subfolder="text_encoder")
174
+ noise_scheduler = DDPMScheduler.from_pretrained(sd_model_name, subfolder="scheduler")
175
+
176
+ if args.load_unet_from_local:
177
+ unet = UNet2DConditionModel.from_pretrained(args.unet_local_path)
178
+ else:
179
+ unet = UNet2DConditionModel.from_pretrained(sd_model_name, subfolder="unet")
180
+
181
+ controlnet = ControlNetModel.from_pretrained(controlnet_model_name, revision=controlnet_revision)
182
+
183
+ if args.edit_condition:
184
+ muse = get_muse(args)
185
+
186
+ vae.requires_grad_(False)
187
+ text_encoder.requires_grad_(False)
188
+ controlnet.requires_grad_(False)
189
+ unet.requires_grad_(False)
190
+ vae.eval()
191
+ text_encoder.eval()
192
+ controlnet.eval()
193
+ unet.eval()
194
+
195
+ if args.enable_xformers_memory_efficient_attention:
196
+ if is_xformers_available():
197
+ import xformers
198
+
199
+ xformers_version = version.parse(xformers.__version__)
200
+ if xformers_version == version.parse("0.0.16"):
201
+ print(
202
+ "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
203
+ )
204
+ unet.enable_xformers_memory_efficient_attention()
205
+ controlnet.enable_xformers_memory_efficient_attention()
206
+ else:
207
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
208
+
209
+ # ========== select params to optimize ==========
210
+ params = []
211
+ for name, param in unet.named_parameters():
212
+ if(name.startswith('up_blocks')):
213
+ params.append(param)
214
+
215
+ if args.unet_layer == 'only1': # 116 layers
216
+ params_to_optimize = [
217
+ {'params': params[38:154]},
218
+ ]
219
+ elif args.unet_layer == 'only2': # 116 layers
220
+ params_to_optimize = [
221
+ {'params': params[154:270]},
222
+ ]
223
+ elif args.unet_layer == 'only3': # 114 layers
224
+ params_to_optimize = [
225
+ {'params': params[270:]},
226
+ ]
227
+ elif args.unet_layer == '1and2': # 232 layers
228
+ params_to_optimize = [
229
+ {'params': params[38:270]},
230
+ ]
231
+ elif args.unet_layer == '2and3': # 230 layers
232
+ params_to_optimize = [
233
+ {'params': params[154:]},
234
+ ]
235
+ elif args.unet_layer == 'all': # all layers
236
+ params_to_optimize = [
237
+ {'params': params},
238
+ ]
239
+
240
+ image = Image.open(args.image_path).convert('RGB')
241
+ condition = Image.open(args.condition_path).convert('RGB')
242
+ image, condition, input_ids, attention_mask = preprocess(image, condition, args.prompt, tokenizer)
243
+
244
+ # Move to device
245
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
246
+ vae.to(device, dtype=torch.float32)
247
+ unet.to(device, dtype=torch.float32)
248
+ text_encoder.to(device, dtype=torch.float32)
249
+ controlnet.to(device)
250
+ image = image.to(device).unsqueeze(0)
251
+ condition = condition.to(device).unsqueeze(0)
252
+ input_ids = input_ids.to(device)
253
+ attention_mask = attention_mask.to(device)
254
+
255
+ # ========== imagic ==========
256
+ if args.load_finetune_from_local:
257
+ print('Loading embeddings from local ...')
258
+ orig_emb = torch.load(os.path.join(args.finetune_path, 'orig_emb.pt'))
259
+ emb = torch.load(os.path.join(args.finetune_path, 'emb.pt'))
260
+ else:
261
+ init_latent = vae.encode(image.to(dtype=torch.float32)).latent_dist.sample()
262
+ init_latent = init_latent * vae.config.scaling_factor
263
+
264
+ if not args.use_english:
265
+ orig_emb = text_encoder(input_ids, attention_mask=attention_mask)[0]
266
+ else:
267
+ orig_emb = text_encoder(input_ids)[0]
268
+ emb = orig_emb.clone()
269
+ torch.save(orig_emb, os.path.join(args.output_dir, 'orig_emb.pt'))
270
+ torch.save(emb, os.path.join(args.output_dir, 'emb.pt'))
271
+
272
+ # 1. Optimize the embedding
273
+ print('1. Optimize the embedding')
274
+ unet.eval()
275
+ emb.requires_grad = True
276
+ lr = 0.001
277
+ it = args.embedding_optimize_it # 500
278
+ opt = torch.optim.Adam([emb], lr=lr)
279
+ history = []
280
+
281
+ pbar = tqdm(
282
+ range(it),
283
+ initial=0,
284
+ desc="Optimize Steps",
285
+ )
286
+ global_step = 0
287
+
288
+ for i in pbar:
289
+ opt.zero_grad()
290
+
291
+ noise = torch.randn_like(init_latent)
292
+ bsz = init_latent.shape[0]
293
+ t_enc = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=init_latent.device)
294
+ t_enc = t_enc.long()
295
+ z = noise_scheduler.add_noise(init_latent, noise, t_enc)
296
+
297
+ controlnet_image = condition.to(dtype=torch.float32)
298
+
299
+ down_block_res_samples, mid_block_res_sample = controlnet(
300
+ z,
301
+ t_enc,
302
+ encoder_hidden_states=emb,
303
+ controlnet_cond=controlnet_image,
304
+ return_dict=False,
305
+ )
306
+
307
+ # Predict the noise residual
308
+ pred_noise = unet(
309
+ z,
310
+ t_enc,
311
+ encoder_hidden_states=emb,
312
+ down_block_additional_residuals=[
313
+ sample.to(dtype=torch.float32) for sample in down_block_res_samples
314
+ ],
315
+ mid_block_additional_residual=mid_block_res_sample.to(dtype=torch.float32),
316
+ ).sample
317
+
318
+ # Get the target for loss depending on the prediction type
319
+ if noise_scheduler.config.prediction_type == "epsilon":
320
+ target = noise
321
+ elif noise_scheduler.config.prediction_type == "v_prediction":
322
+ target = noise_scheduler.get_velocity(init_latent, noise, t_enc)
323
+ else:
324
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
325
+ loss = F.mse_loss(pred_noise.float(), target.float(), reduction="mean")
326
+
327
+ loss.backward()
328
+ global_step += 1
329
+ pbar.set_postfix({"loss": loss.item()})
330
+ history.append(loss.item())
331
+ opt.step()
332
+ opt.zero_grad()
333
+
334
+ # 2. Finetune the model
335
+ print('2. Finetune the model')
336
+ emb.requires_grad = False
337
+ unet.requires_grad_(True)
338
+ unet.train()
339
+
340
+ lr = 5e-5
341
+ it = args.model_finetune_it # 1000
342
+ opt = torch.optim.Adam(params_to_optimize, lr=lr)
343
+ history = []
344
+
345
+ pbar = tqdm(
346
+ range(it),
347
+ initial=0,
348
+ desc="Finetune Steps",
349
+ )
350
+ global_step = 0
351
+ for i in pbar:
352
+ opt.zero_grad()
353
+
354
+ noise = torch.randn_like(init_latent)
355
+ bsz = init_latent.shape[0]
356
+ t_enc = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=init_latent.device)
357
+ t_enc = t_enc.long()
358
+ z = noise_scheduler.add_noise(init_latent, noise, t_enc)
359
+
360
+ controlnet_image = condition.to(dtype=torch.float32)
361
+
362
+ down_block_res_samples, mid_block_res_sample = controlnet(
363
+ z,
364
+ t_enc,
365
+ encoder_hidden_states=emb,
366
+ controlnet_cond=controlnet_image,
367
+ return_dict=False,
368
+ )
369
+
370
+ # Predict the noise residual
371
+ pred_noise = unet(
372
+ z,
373
+ t_enc,
374
+ encoder_hidden_states=emb,
375
+ down_block_additional_residuals=[
376
+ sample.to(dtype=torch.float32) for sample in down_block_res_samples
377
+ ],
378
+ mid_block_additional_residual=mid_block_res_sample.to(dtype=torch.float32),
379
+ ).sample
380
+
381
+ # Get the target for loss depending on the prediction type
382
+ if noise_scheduler.config.prediction_type == "epsilon":
383
+ target = noise
384
+ elif noise_scheduler.config.prediction_type == "v_prediction":
385
+ target = noise_scheduler.get_velocity(init_latent, noise, t_enc)
386
+ else:
387
+ raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
388
+ loss = F.mse_loss(pred_noise.float(), target.float(), reduction="mean")
389
+
390
+ loss.backward()
391
+ global_step += 1
392
+ pbar.set_postfix({"loss": loss.item()})
393
+ history.append(loss.item())
394
+ opt.step()
395
+ opt.zero_grad()
396
+
397
+ # 3. Generate Images
398
+ print("3. Generating images... ")
399
+
400
+ unet.eval()
401
+ controlnet.eval()
402
+
403
+ if args.edit_condition_path is None:
404
+ edit_condition = load_image(args.condition_path)
405
+ else:
406
+ edit_condition = load_image(args.edit_condition_path)
407
+ if args.edit_condition:
408
+ edit_mask = Image.new("L", (256, 256), 0)
409
+ for i in range(256):
410
+ for j in range(256):
411
+ if 40 < i < 220 and 20 < j < 256:
412
+ edit_mask.putpixel((i, j), 255)
413
+
414
+ if args.condition == 'mask':
415
+ condition = 'segmentation'
416
+ elif args.condition == 'landmark':
417
+ condition = 'landmark'
418
+ edit_prompt = f"Generate face {condition} | " + args.prompt
419
+ input_image = edit_condition.resize((256, 256)).convert("RGB")
420
+ edit_condition = muse(edit_prompt, input_image, edit_mask, num_inference_steps=30).images[0].resize((512, 512))
421
+ edit_condition.save(f'{args.output_dir}/edited_condition.png')
422
+
423
+ # remove muse and empty cache
424
+ del muse
425
+ torch.cuda.empty_cache()
426
+
427
+ if sd_model_name.startswith('BAAI'):
428
+ scheduler = PNDMScheduler.from_pretrained(
429
+ sd_model_name,
430
+ subfolder='scheduler',
431
+ )
432
+ scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
433
+ feature_extractor = CLIPFeatureExtractor.from_pretrained(
434
+ sd_model_name,
435
+ subfolder='feature_extractor',
436
+ )
437
+ pipeline = StableDiffusionControlNetPipeline(
438
+ vae=vae,
439
+ text_encoder=text_encoder,
440
+ tokenizer=tokenizer,
441
+ unet=unet,
442
+ controlnet=controlnet,
443
+ scheduler=scheduler,
444
+ safety_checker=None,
445
+ feature_extractor=feature_extractor
446
+ )
447
+ else:
448
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
449
+ sd_model_name,
450
+ vae=vae,
451
+ text_encoder=text_encoder,
452
+ tokenizer=tokenizer,
453
+ unet=unet,
454
+ controlnet=controlnet,
455
+ safety_checker=None,
456
+ )
457
+ pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
458
+ pipeline = pipeline.to(device)
459
+ pipeline.set_progress_bar_config(disable=True)
460
+
461
+ if args.enable_xformers_memory_efficient_attention:
462
+ pipeline.enable_xformers_memory_efficient_attention()
463
+
464
+ if args.seed is None:
465
+ generator = None
466
+ else:
467
+ generator = torch.Generator(device=device).manual_seed(args.seed)
468
+
469
+ with torch.autocast("cuda"):
470
+ image = pipeline(
471
+ image=edit_condition, prompt_embeds=emb, num_inference_steps=20, generator=generator
472
+ ).images[0]
473
+ image.save(f'{args.output_dir}/reconstruct.png')
474
+
475
+ # Interpolate the embedding
476
+ for num_inference_steps in args.num_inference_steps:
477
+ for alpha in args.alpha:
478
+ new_emb = alpha * orig_emb + (1 - alpha) * emb
479
+
480
+ with torch.autocast("cuda"):
481
+ image = pipeline(
482
+ image=edit_condition, prompt_embeds=new_emb, num_inference_steps=num_inference_steps, generator=generator
483
+ ).images[0]
484
+ image.save(f'{args.output_dir}/image_{num_inference_steps}_{alpha}.png')
485
+
486
+ if args.save_unet:
487
+ print('Saving the unet model...')
488
+ unet.save_pretrained(f'{args.output_dir}/unet')
489
+
490
+
491
+ if __name__ == '__main__':
492
+ args = parse_args()
493
+ main(args)
generate.py ADDED
@@ -0,0 +1,176 @@
1
+ import argparse, os, time
2
+ import torch
3
+ from diffusers import (
4
+ AutoencoderKL,
5
+ ControlNetModel,
6
+ StableDiffusionControlNetPipeline,
7
+ UNet2DConditionModel,
8
+ UniPCMultistepScheduler,
9
+ PNDMScheduler,
10
+ AmusedPipeline, AmusedScheduler, VQModel, UVit2DModel
11
+ )
12
+ from transformers import AutoTokenizer, CLIPFeatureExtractor
13
+ from diffusers.pipelines.deprecated.alt_diffusion import RobertaSeriesModelWithTransformation
14
+ from diffusers.utils import load_image
15
+ from utils.mclip import *
16
+
17
+
18
+ def parse_args():
19
+ parser = argparse.ArgumentParser(description="Generate images with M3Face.")
20
+ parser.add_argument(
21
+ "--prompt",
22
+ type=str,
23
+ default="This attractive woman has narrow eyes, rosy cheeks, and wears heavy makeup.",
24
+ help="The input text prompt for image generation."
25
+ )
26
+ parser.add_argument(
27
+ "--condition",
28
+ type=str,
29
+ default="mask",
30
+ choices=["mask", "landmark"],
31
+ help="Use segmentation mask or facial landmarks for image generation."
32
+ )
33
+ parser.add_argument(
34
+ "--condition_path",
35
+ type=str,
36
+ default=None,
37
+ help="Path to the condition mask/landmark image. We will generate the condition if it is not given."
38
+ )
39
+ parser.add_argument("--save_condition", action="store_true", help="Save the generated condition image.")
40
+ parser.add_argument("--use_english", action="store_true", help="Use the English models.")
41
+ parser.add_argument("--enhance_prompt", action="store_true", help="Enhance the given text prompt.")
42
+ parser.add_argument("--num_inference_steps", type=int, default=30)
43
+ parser.add_argument("--num_samples", type=int, default=1)
44
+ parser.add_argument(
45
+ "--additional_prompt",
46
+ type=str,
47
+ default="rim lighting, dslr, ultra quality, sharp focus, dof, Fujifilm XT3, crystal clear, highly detailed glossy eyes, high detailed skin, skin pores, 8K UHD"
48
+ )
49
+ parser.add_argument(
50
+ "--negative_prompt",
51
+ type=str,
52
+ default="low quality, bad quality, worst quality, blurry, disfigured, ugly, immature, cartoon, painting"
53
+ )
54
+ parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible generation.")
55
+ parser.add_argument(
56
+ "--output_dir",
57
+ type=str,
58
+ default="output/",
59
+ help="The output directory where the results will be written.",
60
+ )
61
+ args = parser.parse_args()
62
+
63
+ return args
64
+
65
+ def get_controlnet(args):
66
+ if args.use_english:
67
+ sd_model_name = 'runwayml/stable-diffusion-v1-5'
68
+ controlnet_model_name = 'm3face/ControlnetModels'
69
+ if args.condition == 'mask':
70
+ controlnet_revision = 'segmentation-english'
71
+ elif args.condition == 'landmark':
72
+ controlnet_revision = 'landmark-english'
73
+ controlnet = ControlNetModel.from_pretrained(controlnet_model_name, use_safetensors=True, revision=controlnet_revision)
74
+ pipeline = StableDiffusionControlNetPipeline.from_pretrained(
75
+ sd_model_name, controlnet=controlnet, use_safetensors=True, safety_checker=None
76
+ ).to("cuda")
77
+
78
+ pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config)
79
+ pipeline.enable_model_cpu_offload()
80
+ else:
81
+ sd_model_name = 'BAAI/AltDiffusion-m18'
82
+ controlnet_model_name = 'm3face/ControlnetModels'
83
+ if args.condition == 'mask':
84
+ controlnet_revision = 'segmentation-mlin'
85
+ elif args.condition == 'landmark':
86
+ controlnet_revision = 'landmark-mlin'
87
+ vae = AutoencoderKL.from_pretrained(sd_model_name, subfolder="vae")
88
+ unet = UNet2DConditionModel.from_pretrained(sd_model_name, subfolder="unet")
89
+ tokenizer = AutoTokenizer.from_pretrained(sd_model_name, subfolder="tokenizer", use_fast=False)
90
+ text_encoder = RobertaSeriesModelWithTransformation.from_pretrained(sd_model_name, subfolder="text_encoder")
91
+ controlnet = ControlNetModel.from_pretrained(controlnet_model_name, revision=controlnet_revision)
92
+
93
+ scheduler = PNDMScheduler.from_pretrained(
94
+ sd_model_name,
95
+ subfolder='scheduler',
96
+ )
97
+ scheduler = UniPCMultistepScheduler.from_config(scheduler.config)
98
+ feature_extractor = CLIPFeatureExtractor.from_pretrained(
99
+ sd_model_name,
100
+ subfolder='feature_extractor',
101
+ )
102
+ pipeline = StableDiffusionControlNetPipeline(
103
+ vae=vae,
104
+ text_encoder=text_encoder,
105
+ tokenizer=tokenizer,
106
+ unet=unet,
107
+ controlnet=controlnet,
108
+ scheduler=scheduler,
109
+ safety_checker=None,
110
+ feature_extractor=feature_extractor,
111
+ ).to('cuda')
112
+
113
+ return pipeline
114
+
115
+
116
+ def get_muse(args):
117
+ muse_model_name = 'm3face/FaceConditioning'
118
+ if args.condition == 'mask':
119
+ muse_revision = 'segmentation'
120
+ elif args.condition == 'landmark':
121
+ muse_revision = 'landmark'
122
+ scheduler = AmusedScheduler.from_pretrained(muse_model_name, revision=muse_revision, subfolder='scheduler')
123
+ vqvae = VQModel.from_pretrained(muse_model_name, revision=muse_revision, subfolder='vqvae')
124
+ uvit2 = UVit2DModel.from_pretrained(muse_model_name, revision=muse_revision, subfolder='transformer')
125
+ text_encoder = MultilingualCLIP.from_pretrained(muse_model_name, revision=muse_revision, subfolder='text_encoder')
126
+ tokenizer = AutoTokenizer.from_pretrained(muse_model_name, revision=muse_revision, subfolder='tokenizer')
127
+
128
+ pipeline = AmusedPipeline(
129
+ vqvae=vqvae,
130
+ tokenizer=tokenizer,
131
+ text_encoder=text_encoder,
132
+ transformer=uvit2,
133
+ scheduler=scheduler
134
+ ).to("cuda")
135
+
136
+ return pipeline
137
+
138
+
139
+ if __name__ == '__main__':
140
+ args = parse_args()
141
+
142
+ # ========== set up face generation pipeline ==========
143
+ controlnet = get_controlnet(args)
144
+
145
+ # ========== set output directory ==========
146
+ os.makedirs(args.output_dir, exist_ok=True)
147
+
148
+ # ========== set random seed ==========
149
+ if args.seed is None:
150
+ generator = None
151
+ else:
152
+ generator = torch.Generator().manual_seed(args.seed)
153
+
154
+ # ========== generation ==========
155
+ id = int(time.time())
156
+ if args.condition_path:
157
+ condition = load_image(args.condition_path).resize((512, 512))
158
+ else:
159
+ # generate condition
160
+ muse = get_muse(args)
161
+ if args.condition == 'mask':
162
+ muse_added_prompt = 'Generate face segmentation | '
163
+ elif args.condition == 'landmark':
164
+ muse_added_prompt = 'Generate face landmark | '
165
+ muse_prompt = muse_added_prompt + args.prompt
166
+ condition = muse(muse_prompt, num_inference_steps=256).images[0].resize((512, 512))
167
+ if args.save_condition:
168
+ condition.save(f'{args.output_dir}/{id}_condition.png')
169
+
170
+ latents = torch.randn((args.num_samples, 4, 64, 64), generator=generator)
171
+ prompt = f'{args.prompt}, {args.additional_prompt}' if args.prompt else args.additional_prompt
172
+ images = controlnet(prompt, image=condition, num_inference_steps=args.num_inference_steps, negative_prompt=args.negative_prompt,
173
+ generator=generator, latents=latents, num_images_per_prompt=args.num_samples).images
174
+
175
+ for i, image in enumerate(images):
176
+ image.save(f'{args.output_dir}/{id}_{i}.png')
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ diffusers
2
+ datasets
3
+ transformers
4
+ accelerate
5
+ xformers==0.0.21
6
+ face-alignment
7
+ gdown
utils/dml_csr/dml_csr.py ADDED
@@ -0,0 +1,103 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : dml_csr.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2015 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+
17
+ import torch.nn as nn
18
+
19
+ from torch.nn import functional as F
20
+ from inplace_abn import InPlaceABNSync
21
+ from .modules.ddgcn import DDualGCNHead
22
+ from .modules.parsing import Parsing
23
+ from .modules.edges import Edges
24
+ from .modules.util import Bottleneck
25
+
26
+
27
+ def conv3x3(in_planes, out_planes, stride=1):
28
+ "3x3 convolution with padding"
29
+ return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
30
+ padding=1, bias=False)
31
+
32
+
33
+ class DML_CSR(nn.Module):
34
+ def __init__(self,
35
+ num_classes,
36
+ abn=InPlaceABNSync,
37
+ trained=True):
38
+ super().__init__()
39
+ self.inplanes = 128
40
+ self.is_trained = trained
41
+
42
+ self.conv1 = conv3x3(3, 64, stride=2)
43
+ self.bn1 = abn(64)
44
+ self.relu1 = nn.ReLU(inplace=False)
45
+ self.conv2 = conv3x3(64, 64)
46
+ self.bn2 = abn(64)
47
+ self.relu2 = nn.ReLU(inplace=False)
48
+ self.conv3 = conv3x3(64, 128)
49
+ self.bn3 = abn(128)
50
+ self.relu3 = nn.ReLU(inplace=False)
51
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
52
+ self.layers = [3, 4, 23, 3]
53
+ self.abn = abn
54
+ strides = [1, 2, 1, 1]
55
+ dilations = [1, 1, 1, 2]
56
+
57
+ self.layer1 = self._make_layer(Bottleneck, 64, self.layers[0], stride=strides[0], dilation=dilations[0])
58
+ self.layer2 = self._make_layer(Bottleneck, 128, self.layers[1], stride=strides[1], dilation=dilations[1])
59
+ self.layer3 = self._make_layer(Bottleneck, 256, self.layers[2], stride=strides[2], dilation=dilations[2])
60
+ self.layer4 = self._make_layer(Bottleneck, 512, self.layers[3], stride=strides[3], dilation=dilations[3], multi_grid=(1,1,1))
61
+ # Context Aware
62
+ self.context = DDualGCNHead(2048, 512, abn)
63
+ self.layer6 = Parsing(512, 256, num_classes, abn)
64
+ # edge
65
+ if self.is_trained:
66
+ self.edge_layer = Edges(abn, out_fea=num_classes)
67
+
68
+ def _make_layer(self, block, planes, blocks, stride=1, dilation=1, multi_grid=1):
69
+ downsample = None
70
+ if stride != 1 or self.inplanes != planes * block.expansion:
71
+ downsample = nn.Sequential(
72
+ nn.Conv2d(self.inplanes, planes * block.expansion,
73
+ kernel_size=1, stride=stride, bias=False),
74
+ self.abn(planes * block.expansion, affine=True))
75
+
76
+ layers = []
77
+ generate_multi_grid = lambda index, grids: grids[index%len(grids)] if isinstance(grids, tuple) else 1
78
+ layers.append(block(self.inplanes, planes, stride, abn=self.abn, dilation=dilation, downsample=downsample, multi_grid=generate_multi_grid(0, multi_grid)))
79
+ self.inplanes = planes * block.expansion
80
+ for i in range(1, blocks):
81
+ layers.append(block(self.inplanes, planes, abn=self.abn, dilation=dilation, multi_grid=generate_multi_grid(i, multi_grid)))
82
+
83
+ return nn.Sequential(*layers)
84
+
85
+ def forward(self, x):
86
+ input = x
87
+ x = self.relu1(self.bn1(self.conv1(x)))
88
+ x = self.relu2(self.bn2(self.conv2(x)))
89
+ x1 = self.relu3(self.bn3(self.conv3(x)))
90
+ x = self.maxpool(x1)
91
+ x2 = self.layer1(x) # 119 x 119
92
+ x3 = self.layer2(x2) # 60 x 60
93
+ x4 = self.layer3(x3) # 60 x 60
94
+ x5 = self.layer4(x4) # 60 x 60
95
+ x = self.context(x5)
96
+ seg, x = self.layer6(x, x2)
97
+
98
+ if self.is_trained:
99
+ binary_edge, semantic_edge, edge_fea = self.edge_layer(x2,x3,x4)
100
+ return seg, binary_edge, semantic_edge
101
+
102
+ return seg
103
+
utils/dml_csr/modules/ddgcn.py ADDED
@@ -0,0 +1,182 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : ddgcn.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2022 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torch.nn as nn
19
+
20
+ from inplace_abn import InPlaceABNSync
21
+
22
+
23
+ class SpatialGCN(nn.Module):
24
+ def __init__(self, plane, abn=InPlaceABNSync):
25
+ super(SpatialGCN, self).__init__()
26
+ inter_plane = plane // 2
27
+ self.node_k = nn.Conv2d(plane, inter_plane, kernel_size=1)
28
+ self.node_v = nn.Conv2d(plane, inter_plane, kernel_size=1)
29
+ self.node_q = nn.Conv2d(plane, inter_plane, kernel_size=1)
30
+
31
+ self.conv_wg = nn.Conv1d(inter_plane, inter_plane, kernel_size=1, bias=False)
32
+ self.bn_wg = nn.BatchNorm1d(inter_plane)
33
+ self.softmax = nn.Softmax(dim=2)
34
+
35
+ self.out = nn.Sequential(nn.Conv2d(inter_plane, plane, kernel_size=1),
36
+ abn(plane))
37
+
38
+ self.gamma = nn.Parameter(torch.zeros(1))
39
+
40
+ def forward(self, x):
41
+ # b, c, h, w = x.size()
42
+ node_k = self.node_k(x)
43
+ node_v = self.node_v(x)
44
+ node_q = self.node_q(x)
45
+ b,c,h,w = node_k.size()
46
+ node_k = node_k.view(b, c, -1).permute(0, 2, 1)
47
+ node_q = node_q.view(b, c, -1)
48
+ node_v = node_v.view(b, c, -1).permute(0, 2, 1)
49
+ # A = k * q
50
+ # AV = k * q * v
51
+ # AVW = k *(q *v) * w
52
+ AV = torch.bmm(node_q,node_v)
53
+ AV = self.softmax(AV)
54
+ AV = torch.bmm(node_k, AV)
55
+ AV = AV.transpose(1, 2).contiguous()
56
+ AVW = self.conv_wg(AV)
57
+ AVW = self.bn_wg(AVW)
58
+ AVW = AVW.view(b, c, h, -1)
59
+ # out = F.relu_(self.out(AVW) + x)
60
+ out = self.gamma * self.out(AVW) + x
61
+ return out
62
+
63
+
64
+ class DDualGCN(nn.Module):
65
+ """
66
+ Feature GCN with coordinate GCN
67
+ """
68
+ def __init__(self, planes, abn=InPlaceABNSync, ratio=4):
69
+ super(DDualGCN, self).__init__()
70
+
71
+ self.phi = nn.Conv2d(planes, planes // ratio * 2, kernel_size=1, bias=False)
72
+ self.bn_phi = abn(planes // ratio * 2)
73
+ self.theta = nn.Conv2d(planes, planes // ratio, kernel_size=1, bias=False)
74
+ self.bn_theta = abn(planes // ratio)
75
+
76
+ # Interaction Space
77
+ # Adjacency Matrix: (-)A_g
78
+ self.conv_adj = nn.Conv1d(planes // ratio, planes // ratio, kernel_size=1, bias=False)
79
+ self.bn_adj = nn.BatchNorm1d(planes // ratio)
80
+
81
+ # State Update Function: W_g
82
+ self.conv_wg = nn.Conv1d(planes // ratio * 2, planes // ratio * 2, kernel_size=1, bias=False)
83
+ self.bn_wg = nn.BatchNorm1d(planes // ratio * 2)
84
+
85
+ # last fc
86
+ self.conv3 = nn.Conv2d(planes // ratio * 2, planes, kernel_size=1, bias=False)
87
+ self.bn3 = abn(planes)
88
+
89
+ self.local = nn.Sequential(
90
+ nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False),
91
+ abn(planes),
92
+ nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False),
93
+ abn(planes),
94
+ nn.Conv2d(planes, planes, 3, groups=planes, stride=2, padding=1, bias=False),
95
+ abn(planes))
96
+ self.gcn_local_attention = SpatialGCN(planes, abn)
97
+
98
+ self.final = nn.Sequential(nn.Conv2d(planes * 2, planes, kernel_size=1, bias=False),
99
+ abn(planes))
100
+
101
+ self.gamma1 = nn.Parameter(torch.zeros(1))
102
+
103
+ def to_matrix(self, x):
104
+ n, c, h, w = x.size()
105
+ x = x.view(n, c, -1)
106
+ return x
107
+
108
+ def forward(self, feat):
109
+ # # # # Local # # # #
110
+ x = feat
111
+ local = self.local(feat)
112
+ local = self.gcn_local_attention(local)
113
+ local = F.interpolate(local, size=x.size()[2:], mode='bilinear', align_corners=True)
114
+ spatial_local_feat = x * local + x
115
+
116
+ # # # # Projection Space # # # #
117
+ x_sqz, b = x, x
118
+
119
+ x_sqz = self.phi(x_sqz)
120
+ x_sqz = self.bn_phi(x_sqz)
121
+ x_sqz = self.to_matrix(x_sqz)
122
+
123
+ b = self.theta(b)
124
+ b = self.bn_theta(b)
125
+ b = self.to_matrix(b)
126
+
127
+ # Project
128
+ z_idt = torch.matmul(x_sqz, b.transpose(1, 2)) # channel
129
+
130
+ # # # # Interaction Space # # # #
131
+ z = z_idt.transpose(1, 2).contiguous()
132
+
133
+ z = self.conv_adj(z)
134
+ z = self.bn_adj(z)
135
+
136
+ z = z.transpose(1, 2).contiguous()
137
+ # Laplacian smoothing: (I - A_g)Z => Z - A_gZ
138
+ z += z_idt
139
+
140
+ z = self.conv_wg(z)
141
+ z = self.bn_wg(z)
142
+
143
+ # # # # Re-projection Space # # # #
144
+ # Re-project
145
+ y = torch.matmul(z, b)
146
+
147
+ n, _, h, w = x.size()
148
+ y = y.view(n, -1, h, w)
149
+
150
+ y = self.conv3(y)
151
+ y = self.bn3(y)
152
+
153
+ # g_out = x + y
154
+ # g_out = F.relu_(x+y)
155
+ g_out = self.gamma1*y + x
156
+
157
+ # cat or sum, nearly the same results
158
+ out = self.final(torch.cat((spatial_local_feat, g_out), 1))
159
+
160
+ return out
161
+
162
+
163
+ class DDualGCNHead(nn.Module):
164
+ def __init__(self, inplanes, interplanes, abn=InPlaceABNSync):
165
+ super(DDualGCNHead, self).__init__()
166
+ self.conva = nn.Sequential(nn.Conv2d(inplanes, interplanes, 3, padding=1, bias=False),
167
+ abn(interplanes))
168
+ self.dualgcn = DDualGCN(interplanes, abn)
169
+ self.convb = nn.Sequential(nn.Conv2d(interplanes, interplanes, 3, padding=1, bias=False),
170
+ abn(interplanes))
171
+
172
+ self.bottleneck = nn.Sequential(
173
+ nn.Conv2d(inplanes + interplanes, interplanes, kernel_size=3, padding=1, dilation=1, bias=False),
174
+ abn(interplanes)
175
+ )
176
+
177
+ def forward(self, x):
178
+ output = self.conva(x)
179
+ output = self.dualgcn(output)
180
+ output = self.convb(output)
181
+ output = self.bottleneck(torch.cat([x, output], 1))
182
+ return output
utils/dml_csr/modules/edges.py ADDED
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : edges.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2022 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torch.nn as nn
19
+
20
+ from inplace_abn import InPlaceABNSync
21
+
22
+
23
+ class Edges(nn.Module):
24
+
25
+ def __init__(self, abn=InPlaceABNSync, in_fea=[256,512,1024], mid_fea=256, out_fea=2):
26
+ super(Edges, self).__init__()
27
+
28
+ self.conv1 = nn.Sequential(
29
+ nn.Conv2d(in_fea[0], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
30
+ abn(mid_fea)
31
+ )
32
+ self.conv2 = nn.Sequential(
33
+ nn.Conv2d(in_fea[1], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
34
+ abn(mid_fea)
35
+ )
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(in_fea[2], mid_fea, kernel_size=1, padding=0, dilation=1, bias=False),
38
+ abn(mid_fea)
39
+ )
40
+ self.conv4 = nn.Conv2d(mid_fea,out_fea, kernel_size=3, padding=1, dilation=1, bias=True)
41
+ self.conv5_b = nn.Conv2d(out_fea*3,2, kernel_size=1, padding=0, dilation=1, bias=True)
42
+ self.conv5 = nn.Conv2d(out_fea*3,out_fea, kernel_size=1, padding=0, dilation=1, bias=True)
43
+
44
+
45
+ def forward(self, x1, x2, x3):
46
+ _, _, h, w = x1.size()
47
+
48
+ edge1_fea = self.conv1(x1)
49
+ edge1 = self.conv4(edge1_fea)
50
+ edge2_fea = self.conv2(x2)
51
+ edge2 = self.conv4(edge2_fea)
52
+ edge3_fea = self.conv3(x3)
53
+ edge3 = self.conv4(edge3_fea)
54
+
55
+ edge2_fea = F.interpolate(edge2_fea, size=(h, w), mode='bilinear',align_corners=True)
56
+ edge3_fea = F.interpolate(edge3_fea, size=(h, w), mode='bilinear',align_corners=True)
57
+ edge2 = F.interpolate(edge2, size=(h, w), mode='bilinear',align_corners=True)
58
+ edge3 = F.interpolate(edge3, size=(h, w), mode='bilinear',align_corners=True)
59
+
60
+ edge = torch.cat([edge1, edge2, edge3], dim=1)
61
+ edge_fea = torch.cat([edge1_fea, edge2_fea, edge3_fea], dim=1)
62
+ semantic_edge = self.conv5(edge)
63
+ binary_edge = self.conv5_b(edge)
64
+
65
+ return binary_edge, semantic_edge, edge_fea
66
+
utils/dml_csr/modules/parsing.py ADDED
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : parsing.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2022 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ import torch.nn as nn
19
+
20
+ from inplace_abn import InPlaceABNSync
21
+
22
+
23
+ class Parsing(nn.Module):
24
+ def __init__(self, in_plane1, in_plane2, num_classes, abn=InPlaceABNSync):
25
+ super(Parsing, self).__init__()
26
+ self.conv1 = nn.Sequential(
27
+ nn.Conv2d(in_plane1, 256, kernel_size=1, padding=0, dilation=1, bias=False),
28
+ abn(256)
29
+ )
30
+ self.conv2 = nn.Sequential(
31
+ nn.Conv2d(in_plane2, 48, kernel_size=1, stride=1, padding=0, dilation=1, bias=False),
32
+ abn(48)
33
+ )
34
+ self.conv3 = nn.Sequential(
35
+ nn.Conv2d(304, 256, kernel_size=1, padding=0, dilation=1, bias=False),
36
+ abn(256),
37
+ nn.Conv2d(256, 256, kernel_size=1, padding=0, dilation=1, bias=False),
38
+ abn(256)
39
+ )
40
+ self.conv4 = nn.Conv2d(256, num_classes, kernel_size=1, padding=0, dilation=1, bias=True)
41
+
42
+ def forward(self, xt, xl):
43
+ _, _, h, w = xl.size()
44
+
45
+ xt = F.interpolate(self.conv1(xt), size=(h, w), mode='bilinear', align_corners=True)
46
+ xl = self.conv2(xl)
47
+ x = torch.cat([xt, xl], dim=1)
48
+ x = self.conv3(x)
49
+ seg = self.conv4(x)
50
+ return seg, x
51
+
utils/dml_csr/modules/util.py ADDED
@@ -0,0 +1,58 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : util.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2022 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+ import torch.nn as nn
17
+
18
+ from inplace_abn import InPlaceABNSync
19
+
20
+
21
+ class Bottleneck(nn.Module):
22
+ expansion = 4
23
+ def __init__(self, inplanes, planes, stride=1, abn=InPlaceABNSync, dilation=1, downsample=None, fist_dilation=1, multi_grid=1):
24
+ super(Bottleneck, self).__init__()
25
+ self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
26
+ self.bn1 = abn(planes)
27
+ self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
28
+ padding=dilation*multi_grid, dilation=dilation*multi_grid, bias=False)
29
+ self.bn2 = abn(planes)
30
+ self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
31
+ self.bn3 = abn(planes * 4)
32
+ self.relu = nn.ReLU(inplace=False)
33
+ self.relu_inplace = nn.ReLU(inplace=True)
34
+ self.downsample = downsample
35
+ self.dilation = dilation
36
+ self.stride = stride
37
+
38
+ def forward(self, x):
39
+ residual = x
40
+
41
+ out = self.conv1(x)
42
+ out = self.bn1(out)
43
+ out = self.relu(out)
44
+
45
+ out = self.conv2(out)
46
+ out = self.bn2(out)
47
+ out = self.relu(out)
48
+
49
+ out = self.conv3(out)
50
+ out = self.bn3(out)
51
+
52
+ if self.downsample is not None:
53
+ residual = self.downsample(x)
54
+
55
+ out = out + residual
56
+ out = self.relu_inplace(out)
57
+
58
+ return out
utils/dml_csr/transforms.py ADDED
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env python
2
+ # -*- encoding: utf-8 -*-
3
+ """
4
+ @Author : Qingping Zheng
5
+ @Contact : qingpingzheng2014@gmail.com
6
+ @File : transforms.py
7
+ @Time : 10/01/21 00:00 PM
8
+ @Desc :
9
+ @License : Licensed under the Apache License, Version 2.0 (the "License");
10
+ @Copyright : Copyright 2022 The Authors. All Rights Reserved.
11
+ """
12
+ from __future__ import absolute_import
13
+ from __future__ import division
14
+ from __future__ import print_function
15
+
16
+
17
+ import numpy as np
18
+ import cv2
19
+
20
+
21
+ def flip_back(output_flipped, matched_parts):
22
+ '''
23
+ output_flipped: numpy.ndarray(batch_size, num_joints, height, width)
24
+ '''
25
+ assert output_flipped.ndim == 4,\
26
+ 'output_flipped should be [batch_size, num_joints, height, width]'
27
+
28
+ output_flipped = output_flipped[:, :, :, ::-1]
29
+
30
+ for pair in matched_parts:
31
+ tmp = output_flipped[:, pair[0], :, :].copy()
32
+ output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
33
+ output_flipped[:, pair[1], :, :] = tmp
34
+
35
+ return output_flipped
36
+
37
+
38
+ def transform_parsing(pred, center, scale, width, height, input_size):
39
+
40
+ if center is not None:
41
+ trans = get_affine_transform(center, scale, 0, input_size, inv=1)
42
+ target_pred = cv2.warpAffine(
43
+ pred,
44
+ trans,
45
+ (int(width), int(height)), #(int(width), int(height)),
46
+ flags=cv2.INTER_NEAREST,
47
+ borderMode=cv2.BORDER_CONSTANT,
48
+ borderValue=(0))
49
+ else:
50
+ target_pred = cv2.resize(pred, (int(width), int(height)), interpolation=cv2.INTER_NEAREST)
51
+
52
+ return target_pred
53
+
54
+
55
+ def get_affine_transform(center,
56
+ scale,
57
+ rot,
58
+ output_size,
59
+ shift=np.array([0, 0], dtype=np.float32),
60
+ inv=0):
61
+ if not isinstance(scale, np.ndarray) and not isinstance(scale, list):
62
+ print(scale)
63
+ scale = np.array([scale, scale])
64
+
65
+ scale_tmp = scale
66
+
67
+ src_w = scale_tmp[0]
68
+ dst_w = output_size[1]
69
+ dst_h = output_size[0]
70
+
71
+ rot_rad = np.pi * rot / 180
72
+ src_dir = get_dir([0, src_w * -0.5], rot_rad)
73
+ dst_dir = np.array([0, dst_w * -0.5], np.float32)
74
+
75
+ src = np.zeros((3, 2), dtype=np.float32)
76
+ dst = np.zeros((3, 2), dtype=np.float32)
77
+ src[0, :] = center + scale_tmp * shift
78
+ src[1, :] = center + src_dir + scale_tmp * shift
79
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
80
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
81
+
82
+ src[2:, :] = get_3rd_point(src[0, :], src[1, :])
83
+ dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :])
84
+
85
+ if inv:
86
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
87
+ else:
88
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
89
+
90
+ return trans
91
+
92
+
93
+ def affine_transform(pt, t):
94
+ new_pt = np.array([pt[0], pt[1], 1.]).T
95
+ new_pt = np.dot(t, new_pt)
96
+ return new_pt[:2]
97
+
98
+
99
+ def get_3rd_point(a, b):
100
+ direct = a - b
101
+ return b + np.array([-direct[1], direct[0]], dtype=np.float32)
102
+
103
+
104
+ def get_dir(src_point, rot_rad):
105
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
106
+
107
+ src_result = [0, 0]
108
+ src_result[0] = src_point[0] * cs - src_point[1] * sn
109
+ src_result[1] = src_point[0] * sn + src_point[1] * cs
110
+
111
+ return src_result
112
+
113
+
114
+ def crop(img, center, scale, output_size, rot=0):
115
+ trans = get_affine_transform(center, scale, rot, output_size)
116
+
117
+ dst_img = cv2.warpAffine(img,
118
+ trans,
119
+ (int(output_size[1]), int(output_size[0])),
120
+ flags=cv2.INTER_LINEAR)
121
+
122
+ return dst_img
utils/mclip.py ADDED
@@ -0,0 +1,70 @@
1
+ import torch
2
+ import transformers
3
+ from typing import Union, Optional, Tuple
4
+ from transformers import AutoConfig, AutoModel
5
+ from transformers.models.clip.modeling_clip import CLIPTextModelOutput
6
+
7
+
8
+ class MCLIPConfig(transformers.PretrainedConfig):
9
+ model_type = "M-CLIP"
10
+
11
+ def __init__(self, modelBase='xlm-roberta-large', transformerDimSize=1024, imageDimSize=768, **kwargs):
12
+ self.transformerDimensions = transformerDimSize
13
+ self.numDims = imageDimSize
14
+ self.modelBase = modelBase
15
+ super().__init__(**kwargs)
16
+
17
+
18
+
19
+ class MultilingualCLIP(transformers.PreTrainedModel):
20
+ config_class = MCLIPConfig
21
+
22
+ def __init__(self, config, *args, **kwargs):
23
+ super().__init__(config, *args, **kwargs)
24
+ self.transformer = transformers.AutoModel.from_pretrained(config.modelBase)
25
+ self.LinearTransformation = torch.nn.Linear(in_features=config.transformerDimensions,
26
+ out_features=config.numDims)
27
+
28
+ def forward(
29
+ self,
30
+ input_ids: Optional[torch.Tensor] = None,
31
+ attention_mask: Optional[torch.Tensor] = None,
32
+ position_ids: Optional[torch.Tensor] = None,
33
+ output_attentions: Optional[bool] = None,
34
+ output_hidden_states: Optional[bool] = None,
35
+ return_dict: Optional[bool] = None,
36
+ ) -> Union[Tuple, CLIPTextModelOutput]:
37
+
38
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
39
+
40
+ text_outputs = self.transformer(
41
+ input_ids=input_ids,
42
+ attention_mask=attention_mask,
43
+ position_ids=position_ids,
44
+ output_attentions=output_attentions,
45
+ output_hidden_states=output_hidden_states,
46
+ return_dict=return_dict,
47
+ )
48
+
49
+ pooled_output = text_outputs[1]
50
+
51
+ text_embeds = self.LinearTransformation(pooled_output)
52
+
53
+ if not return_dict:
54
+ outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
55
+ return tuple(output for output in outputs if output is not None)
56
+
57
+ return CLIPTextModelOutput(
58
+ text_embeds=text_embeds,
59
+ last_hidden_state=text_outputs.last_hidden_state,
60
+ hidden_states=text_outputs.hidden_states,
61
+ attentions=text_outputs.attentions,
62
+ )
63
+
64
+ @classmethod
65
+ def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True):
66
+ model.load_state_dict(state_dict)
67
+ return model, [], [], []
68
+
69
+ AutoConfig.register("M-CLIP", MCLIPConfig)
70
+ AutoModel.register(MCLIPConfig, MultilingualCLIP)
utils/plot_landmark.py ADDED
@@ -0,0 +1,138 @@
1
+ import os
2
+ import PIL
3
+ import cv2
4
+ import pickle
5
+ import argparse
6
+ import numpy as np
7
+ import face_alignment
8
+ import matplotlib.pyplot as plt
9
+ import matplotlib.patches as patches
10
+ from matplotlib.path import Path
11
+
12
+
13
+ def parse_args():
14
+ parser = argparse.ArgumentParser(description="Plot facial landmarks from an image.")
15
+ parser.add_argument(
16
+ "--image_path",
17
+ type=str,
18
+ default=None,
19
+ help="Path to the image file."
20
+ )
21
+ parser.add_argument("--size", type=int, default=512)
22
+ parser.add_argument("--crop", action="store_true", help="Crop around the face image.")
23
+ parser.add_argument(
24
+ "--output_dir",
25
+ type=str,
26
+ default="output/landmarks/",
27
+ help="Folder to save landmark images."
28
+ )
29
+ args = parser.parse_args()
30
+
31
+ return args
32
+
33
+ def get_patch(landmarks, color='lime', closed=False):
34
+ contour = landmarks
35
+ ops = [Path.MOVETO] + [Path.LINETO]*(len(contour)-1)
36
+ facecolor = (0, 0, 0, 0) # Transparent fill color, if open
37
+ if closed:
38
+ contour.append(contour[0])
39
+ ops.append(Path.CLOSEPOLY)
40
+ facecolor = color
41
+ path = Path(contour, ops)
42
+ return patches.PathPatch(path, facecolor=facecolor, edgecolor=color, lw=4)
43
+
44
+ def bbox_from_landmarks(landmarks):
45
+ landmarks_x, landmarks_y = zip(*landmarks)
46
+
47
+ x_min, x_max = min(landmarks_x), max(landmarks_x)
48
+ y_min, y_max = min(landmarks_y), max(landmarks_y)
49
+ width = x_max - x_min
50
+ height = y_max - y_min
51
+
52
+ # Add a small margin around the landmark bounding box
53
+ x_min -= 25
54
+ y_min -= 25
55
+ width += 50
56
+ height += 50
57
+ bbox = (x_min, y_min, width, height)
58
+ return bbox
59
+
60
+ def plot_landmarks(landmarks, crop=False, size=512):
61
+ if crop:
62
+ (x_min, y_min, width, height) = bbox_from_landmarks(landmarks)
63
+ # print(x_min, y_min, width, height)
64
+ landmarks_np = np.array(landmarks)
65
+ landmarks_np[:, 0] = (landmarks_np[:, 0] - x_min) * size / width
66
+ landmarks_np[:, 1] = (landmarks_np[:, 1] - y_min) * size / height
67
+ landmarks = landmarks_np.tolist()
68
+ # Precisely control output image size
69
+ dpi = 72
70
+ fig, ax = plt.subplots(1, figsize=[size/dpi, size/dpi], tight_layout={'pad':0})
71
+ fig.set_dpi(dpi)
72
+
73
+ black = np.zeros((size, size, 3))
74
+ ax.imshow(black)
75
+
76
+ face_patch = get_patch(landmarks[0:17])
77
+ l_eyebrow = get_patch(landmarks[17:22], color='yellow')
78
+ r_eyebrow = get_patch(landmarks[22:27], color='yellow')
79
+ nose_v = get_patch(landmarks[27:31], color='orange')
80
+ nose_h = get_patch(landmarks[31:36], color='orange')
81
+ l_eye = get_patch(landmarks[36:42], color='magenta', closed=True)
82
+ r_eye = get_patch(landmarks[42:48], color='magenta', closed=True)
83
+ outer_lips = get_patch(landmarks[48:60], color='cyan', closed=True)
84
+ inner_lips = get_patch(landmarks[60:68], color='blue', closed=True)
85
+
86
+ ax.add_patch(face_patch)
87
+ ax.add_patch(l_eyebrow)
88
+ ax.add_patch(r_eyebrow)
89
+ ax.add_patch(nose_v)
90
+ ax.add_patch(nose_h)
91
+ ax.add_patch(l_eye)
92
+ ax.add_patch(r_eye)
93
+ ax.add_patch(outer_lips)
94
+ ax.add_patch(inner_lips)
95
+
96
+ plt.axis('off')
97
+
98
+ fig.canvas.draw()
99
+ buffer, (width, height) = fig.canvas.print_to_buffer()
100
+ assert width == height
101
+ assert width == size
102
+
103
+ buffer = np.frombuffer(buffer, np.uint8).reshape((height, width, 4))
104
+ buffer = buffer[:, :, 0:3]
105
+ plt.close(fig)
106
+ return PIL.Image.fromarray(buffer)
107
+
108
+ def get_landmarks(image):
109
+ fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=False, face_detector='sfd')
110
+ faces = fa.get_landmarks_from_image(image)
111
+ if faces is None or len(faces) == 0:
112
+ return None
113
+ landmarks = faces[0]
114
+ return landmarks
115
+
116
+ def save_landmarks(args):
117
+ os.makedirs(args.output_dir, exist_ok=True)
118
+
119
+ image_name = os.path.basename(args.image_path)
120
+ image = cv2.imread(args.image_path)
121
+ image = cv2.resize(image, (args.size, args.size))
122
+ landmarks = get_landmarks(image)
123
+ if landmarks is None:
124
+ print(f'No faces found in {image_name}')
125
+ return
126
+
127
+ filename = f'{args.output_dir}/{image_name}'
128
+ if args.crop:
129
+ landmarks_cropped_image = plot_landmarks(landmarks.tolist(), crop=True, size=args.size)
130
+ landmarks_cropped_image.save(filename)
131
+ else:
132
+ landmarks_image = plot_landmarks(landmarks.tolist(), size=args.size)
133
+ landmarks_image.save(filename)
134
+ print(f'Landmark saved in {filename}')
135
+
136
+ if __name__ == '__main__':
137
+ args = parse_args()
138
+ save_landmarks(args)
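
Besides the CLI entry point, `get_landmarks` and `plot_landmarks` can be called directly. A minimal sketch, assuming the repo root is on the Python path and `face.jpg` is a placeholder image containing a single face:

```python
# Minimal sketch: run from the repo root so `utils` is importable;
# "face.jpg" is a hypothetical input image.
import cv2
from utils.plot_landmark import get_landmarks, plot_landmarks

image = cv2.resize(cv2.imread("face.jpg"), (512, 512))
landmarks = get_landmarks(image)  # array of 68 (x, y) points, or None if no face is found
if landmarks is not None:
    plot_landmarks(landmarks.tolist(), crop=True, size=512).save("face_landmark.png")
```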
utils/plot_mask.py ADDED
@@ -0,0 +1,191 @@
+ import os
+ import cv2
+ import gdown
+ import shutil
+ import argparse
+ import numpy as np
+ import torch
+ import torch.backends.cudnn as cudnn
+ import torchvision.transforms as transforms
+ from torchvision.utils import save_image
+
+ from inplace_abn import InPlaceABN
+ from dml_csr import dml_csr
+ from dml_csr import transforms as dml_transforms
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Plot segmentation mask of an image.")
+     parser.add_argument(
+         "--image_path",
+         type=str,
+         default=None,
+         help="Path to the image file."
+     )
+     parser.add_argument("--size", type=int, default=512)
+     parser.add_argument(
+         "--checkpoint_path",
+         type=str,
+         default='ckpt/DML_CSR/dml_csr_celebA.pth',
+         help="Path to the DML-CSR pretrained model."
+     )
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="output/masks/",
+         help="Folder to save segmentation mask."
+     )
+     args = parser.parse_args()
+
+     return args
+
+ def download_checkpoint():
+     os.makedirs('ckpt', exist_ok=True)
+     id = "1xttWuAj633-ujp_vcm5DtL98PP0b-sUm"
+     gdown.download(id=id, output='ckpt/DML_CSR.zip')
+     shutil.unpack_archive('ckpt/DML_CSR.zip', 'ckpt')
+     os.remove('ckpt/DML_CSR.zip')
+
+ def box2cs(box: list) -> tuple:
+     x, y, w, h = box[:4]
+     return xywh2cs(x, y, w, h)
+
+ def xywh2cs(x: float, y: float, w: float, h: float) -> tuple:
+     center = np.zeros((2), dtype=np.float32)
+     center[0] = x + w * 0.5
+     center[1] = y + h * 0.5
+     if w > h:
+         h = w
+     elif w < h:
+         w = h
+     scale = np.array([w * 1.0, h * 1.0], dtype=np.float32)
+
+     return center, scale
+
+ def labelcolormap(N):
+     if N == 19:  # CelebAMask-HQ
+         cmap = np.array([(0, 0, 0), (204, 0, 0), (76, 153, 0),
+                          (204, 204, 0), (204, 0, 204), (204, 0, 204), (255, 204, 204),
+                          (255, 204, 204), (102, 51, 0), (102, 51, 0), (102, 204, 0),
+                          (255, 255, 0), (0, 0, 153), (0, 0, 204), (255, 51, 153),
+                          (0, 204, 204), (0, 51, 0), (255, 153, 51), (0, 204, 0)],
+                         dtype=np.uint8)
+     else:
+         def uint82bin(n, count=8):
+             """returns the binary of integer n, count refers to amount of bits"""
+             return ''.join([str((n >> y) & 1) for y in range(count-1, -1, -1)])
+
+         cmap = np.zeros((N, 3), dtype=np.uint8)
+         for i in range(N):
+             r, g, b = 0, 0, 0
+             id = i
+             for j in range(7):
+                 str_id = uint82bin(id)
+                 r = r ^ (np.uint8(str_id[-1]) << (7-j))
+                 g = g ^ (np.uint8(str_id[-2]) << (7-j))
+                 b = b ^ (np.uint8(str_id[-3]) << (7-j))
+                 id = id >> 3
+             cmap[i, 0] = r
+             cmap[i, 1] = g
+             cmap[i, 2] = b
+     return cmap
+
+ class Colorize(object):
+     def __init__(self, n=19):
+         self.cmap = labelcolormap(n)
+         self.cmap = torch.from_numpy(self.cmap[:n])
+
+     def __call__(self, gray_image):
+         size = gray_image.size()
+         color_image = torch.ByteTensor(3, size[1], size[2]).fill_(0)
+
+         for label in range(0, len(self.cmap)):
+             mask = (label == gray_image[0]).cpu()
+             color_image[0][mask] = self.cmap[label][0]
+             color_image[1][mask] = self.cmap[label][1]
+             color_image[2][mask] = self.cmap[label][2]
+
+         return color_image
+
+ def tensor2label(label_tensor, n_label):
+     label_tensor = label_tensor.cpu().float()
+     if label_tensor.size()[0] > 1:
+         label_tensor = label_tensor.max(0, keepdim=True)[1]
+     label_tensor = Colorize(n_label)(label_tensor)
+     # label_numpy = np.transpose(label_tensor.numpy(), (1, 2, 0))
+     label_numpy = label_tensor.numpy()
+     label_numpy = label_numpy / 255.0
+
+     return label_numpy
+
+ def generate_label(inputs, imsize):
+     pred_batch = []
+     for input in inputs:
+         input = input.view(1, 19, imsize, imsize)
+         pred = np.squeeze(input.data.max(1)[1].cpu().numpy(), axis=0)
+         pred_batch.append(pred)
+
+     pred_batch = np.array(pred_batch)
+     pred_batch = torch.from_numpy(pred_batch)
+
+     label_batch = []
+     for p in pred_batch:
+         p = p.view(1, imsize, imsize)
+         label_batch.append(tensor2label(p, 19))
+
+     label_batch = np.array(label_batch)
+     label_batch = torch.from_numpy(label_batch)
+
+     return label_batch
+
+ def get_mask(model, image, input_size):
+     interp = torch.nn.Upsample(size=input_size, mode='bilinear', align_corners=True)
+
+     image = image.unsqueeze(0)
+     with torch.no_grad():
+         outputs = model(image.cuda())
+         labels = generate_label(interp(outputs), input_size[0])
+     return labels[0]
+
+ def save_mask(args):
+     os.makedirs(args.output_dir, exist_ok=True)
+
+     cudnn.benchmark = True
+     cudnn.enabled = True
+
+     model = dml_csr.DML_CSR(19, InPlaceABN, False)
+
+     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                      std=[0.229, 0.224, 0.225])
+     transform = transforms.Compose([transforms.ToTensor(), normalize])
+
+     input_size = (args.size, args.size)
+     image = cv2.imread(args.image_path, cv2.IMREAD_COLOR)
+     h, w, _ = image.shape
+     center, s = box2cs([0, 0, w - 1, h - 1])
+     r = 0
+     crop_size = np.asarray(input_size)
+     trans = dml_transforms.get_affine_transform(center, s, r, crop_size)
+     image = cv2.warpAffine(image, trans, (int(crop_size[1]), int(crop_size[0])),
+                            flags=cv2.INTER_LINEAR,
+                            borderMode=cv2.BORDER_CONSTANT,
+                            borderValue=(0, 0, 0))
+     image = transform(image)
+
+     if not os.path.exists(args.checkpoint_path):
+         download_checkpoint()
+     state_dict = torch.load(args.checkpoint_path, map_location='cuda:0')
+     model.load_state_dict(state_dict)
+
+     model.cuda()
+     model.eval()
+
+     mask = get_mask(model, image, input_size)
+     filename = os.path.join(args.output_dir, os.path.basename(args.image_path).split('.')[0] + '.png')
+     save_image(mask, filename)
+     print(f'Mask saved in {filename}')
+
+
+ if __name__ == '__main__':
+     args = parse_args()
+     save_mask(args)
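
`save_mask` can likewise be driven programmatically instead of through `argparse`. A minimal sketch, assuming the repo root is on the Python path, a CUDA device is available, and `face.jpg` is a placeholder input; the DML-CSR checkpoint is downloaded on first use if missing:

```python
# Minimal sketch: "face.jpg" is a hypothetical input image; paths mirror the script defaults.
from types import SimpleNamespace
from utils.plot_mask import save_mask

args = SimpleNamespace(
    image_path="face.jpg",
    size=512,
    checkpoint_path="ckpt/DML_CSR/dml_csr_celebA.pth",  # fetched automatically if missing
    output_dir="output/masks/",
)
save_mask(args)  # writes output/masks/face.png
```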