Spaces:

Kevin-thu
/

DiffMorpher

Running on T4

App Files Files Community

Kevin committed on Dec 22, 2023

Commit

6ee2eb6

•

1 Parent(s): 34b176d

Add app

Browse files

Files changed (8) hide show

.gitignore +4 -0
Biden.jpg +0 -0
Trump.jpg +0 -0
alpha_scheduler.py +54 -0
app.py +306 -0
lora_utils.py +318 -0
morph_attn.py +827 -0
requirements.txt +14 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+lora/
+__pycache__/
+results/
+core*

Biden.jpg ADDED Viewed

Trump.jpg ADDED Viewed

alpha_scheduler.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import bisect
+import torch
+import torch.nn.functional as F
+import lpips
+perceptual_loss = lpips.LPIPS()
+def distance(img_a, img_b):
+    return perceptual_loss(img_a, img_b).item()
+    # return F.mse_loss(img_a, img_b).item()
+class AlphaScheduler:
+    def __init__(self):
+        ...
+    def from_imgs(self, imgs):
+        self.__num_values = len(imgs)
+        self.__values = [0]
+        for i in range(self.__num_values - 1):
+            dis = distance(imgs[i], imgs[i + 1])
+            self.__values.append(dis)
+            self.__values[i + 1] += self.__values[i]
+        for i in range(self.__num_values):
+            self.__values[i] /= self.__values[-1]
+    def save(self, filename):
+        torch.save(torch.tensor(self.__values), filename)
+    def load(self, filename):
+        self.__values = torch.load(filename).tolist()
+        self.__num_values = len(self.__values)
+    def get_x(self, y):
+        assert y >= 0 and y <= 1
+        id = bisect.bisect_left(self.__values, y)
+        id -= 1
+        if id < 0:
+            id = 0
+        yl = self.__values[id]
+        yr = self.__values[id + 1]
+        xl = id * (1 / (self.__num_values - 1))
+        xr = (id + 1) * (1 / (self.__num_values - 1))
+        x = (y - yl) / (yr - yl) * (xr - xl) + xl
+        return x
+    def get_list(self, len=None):
+        if len is None:
+            len = self.__num_values
+        ys = torch.linspace(0, 1, len)
+        res = [self.get_x(y) for y in ys]
+        return res

app.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import os
+import torch
+import numpy as np
+import cv2
+import gradio as gr
+from PIL import Image
+from datetime import datetime
+from morph_attn import DiffMorpherPipeline
+from lora_utils import train_lora
+# Side length in pixels used for the square Gradio image/video widgets and for the output video writer.
+LENGTH=480
+def train_lora_interface(
+    image,
+    prompt,
+    model_path,
+    output_path,
+    lora_steps,
+    lora_rank,
+    lora_lr,
+    num
+):
+    os.makedirs(output_path, exist_ok=True)
+    train_lora(image, prompt, output_path, model_path,
+               lora_steps=lora_steps, lora_lr=lora_lr, lora_rank=lora_rank, weight_name=f"lora_{num}.ckpt", progress=gr.Progress())
+    return f"Train LoRA {'A' if num == 0 else 'B'} Done!"
+def run_diffmorpher(
+    image_0,
+    image_1,
+    prompt_0,
+    prompt_1,
+    model_path,
+    lora_mode,
+    lamb,
+    use_adain,
+    use_reschedule,
+    num_frames,
+    fps,
+    load_lora_path_0,
+    load_lora_path_1,
+    output_path
+):
+    run_id = datetime.now().strftime("%H%M") + "_" +  datetime.now().strftime("%Y%m%d")
+    os.makedirs(output_path, exist_ok=True)
+    morpher_pipeline = DiffMorpherPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cuda")
+    if lora_mode == "Fix LoRA 0":
+        fix_lora = 0
+    elif lora_mode == "Fix LoRA 1":
+        fix_lora = 1
+    else:
+        fix_lora = None
+    if not load_lora_path_0:
+        load_lora_path_0 = f"{output_path}/lora_0.ckpt"
+    if not load_lora_path_1:
+        load_lora_path_1 = f"{output_path}/lora_1.ckpt"
+    images = morpher_pipeline(
+        img_0=image_0,
+        img_1=image_1,
+        prompt_0=prompt_0,
+        prompt_1=prompt_1,
+        load_lora_path_0=load_lora_path_0,
+        load_lora_path_1=load_lora_path_1,
+        lamb=lamb,
+        use_adain=use_adain,
+        use_reschedule=use_reschedule,
+        num_frames=num_frames,
+        fix_lora=fix_lora,
+        progress=gr.Progress()
+    )
+    video_path = f"{output_path}/{run_id}.mp4"
+    video = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (LENGTH, LENGTH))
+    for image in images:
+        video.write(cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR))
+    video.release()
+    cv2.destroyAllWindows()
+    return output_video.update(value=video_path)
+def run_all(
+    image_0,
+    image_1,
+    prompt_0,
+    prompt_1,
+    model_path,
+    lora_mode,
+    lamb,
+    use_adain,
+    use_reschedule,
+    num_frames,
+    fps,
+    load_lora_path_0,
+    load_lora_path_1,
+    output_path,
+    lora_steps,
+    lora_rank,
+    lora_lr
+):
+    os.makedirs(output_path, exist_ok=True)
+    train_lora(image_0, prompt_0, output_path, model_path,
+        lora_steps=lora_steps, lora_lr=lora_lr, lora_rank=lora_rank, weight_name=f"lora_0.ckpt", progress=gr.Progress())
+    train_lora(image_1, prompt_1, output_path, model_path,
+        lora_steps=lora_steps, lora_lr=lora_lr, lora_rank=lora_rank, weight_name=f"lora_1.ckpt", progress=gr.Progress())
+    return run_diffmorpher(
+        image_0,
+        image_1,
+        prompt_0,
+        prompt_1,
+        model_path,
+        lora_mode,
+        lamb,
+        use_adain,
+        use_reschedule,
+        num_frames,
+        fps,
+        load_lora_path_0,
+        load_lora_path_1,
+        output_path
+    )
+# Build the Gradio UI: layout first, then the event wiring at the bottom of the block.
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown("""
+        # Official Implementation of [DiffMorpher](https://kevin-thu.github.io/DiffMorpher_page/)
+        """)
+    # State holding the 512x512 RGB PIL versions of the two inputs, seeded with the bundled sample images.
+    original_image_0, original_image_1 = gr.State(Image.open("Trump.jpg").convert("RGB").resize((512,512), Image.BILINEAR)), gr.State(Image.open("Biden.jpg").convert("RGB").resize((512,512), Image.BILINEAR))
+    # key_points_0, key_points_1 = gr.State([]), gr.State([])
+    # to_change_points = gr.State([])
+    with gr.Row():
+        # Column 1: image A, its prompt, and the two LoRA training buttons.
+        with gr.Column():
+            input_img_0 = gr.Image(type="numpy", label="Input image A", value="Trump.jpg", show_label=True, height=LENGTH, width=LENGTH, interactive=True)
+            prompt_0 = gr.Textbox(label="Prompt for image A", value="a photo of an American man", interactive=True)
+            with gr.Row():
+                train_lora_0_button = gr.Button("Train LoRA A")
+                train_lora_1_button = gr.Button("Train LoRA B")
+            # show_correspond_button = gr.Button("Show correspondence points")
+        # Column 2: image B, its prompt, and the clear / run-without-training buttons.
+        with gr.Column():
+            input_img_1 = gr.Image(type="numpy", label="Input image B ", value="Biden.jpg", show_label=True, height=LENGTH, width=LENGTH, interactive=True)
+            prompt_1 = gr.Textbox(label="Prompt for image B", value="a photo of an American man", interactive=True)
+            with gr.Row():
+                clear_button = gr.Button("Clear All")
+                run_button = gr.Button("Run w/o LoRA training")
+        # Column 3: output video, training status text, and the train-then-run button.
+        with gr.Column():
+            output_video = gr.Video(format="mp4", label="Output video", show_label=True, height=LENGTH, width=LENGTH, interactive=False)
+            lora_progress_bar = gr.Textbox(label="Display LoRA training progress", interactive=False)
+            run_all_button = gr.Button("Run!")
+        # with gr.Column():
+        #     output_video = gr.Video(label="Output video", show_label=True, height=LENGTH, width=LENGTH)
+    with gr.Row():
+        gr.Markdown("""
+        ### Usage:
+        1. Upload two images (with correspondence) and fill out the prompts.
+        2. Click **"Run!"**
+        Or:
+        1. Upload two images (with correspondence) and fill out the prompts.
+        2. Click the **"Train LoRA A/B"** button to fit two LoRAs for two images respectively. <br> &nbsp;&nbsp;
+           If you have trained LoRA A or LoRA B before, you can skip the step and fill the specific LoRA path in LoRA settings. <br> &nbsp;&nbsp;
+           Trained LoRAs are saved to `[Output Path]/lora_0.ckpt` and `[Output Path]/lora_1.ckpt` by default.
+        3. You might also change the settings below.
+        4. Click **"Run w/o LoRA training"**
+        ### Note:
+        1. To speed up the generation process, you can **ruduce the number of frames** or **turn off "Use Reschedule"** ("Use Reschedule" will double the generation time).
+        2. You can try the influence of different prompts. It seems that using the same prompts or aligned prompts works better.
+        ### Have fun!
+        """)
+    with gr.Accordion(label="Algorithm Parameters"):
+        with gr.Tab("Basic Settings"):
+            with gr.Row():
+                # local_models_dir = 'local_pretrained_models'
+                # local_models_choice = \
+                #     [os.path.join(local_models_dir,d) for d in os.listdir(local_models_dir) if os.path.isdir(os.path.join(local_models_dir,d))]
+                model_path = gr.Text(value="stabilityai/stable-diffusion-2-1-base",
+                    label="Diffusion Model Path", interactive=True
+                )
+                lamb = gr.Slider(value=0.6, minimum=0, maximum=1, step=0.1, label="Lambda for attention replacement", interactive=True)
+                # The selected label is passed verbatim to the run callbacks as `lora_mode`.
+                lora_mode = gr.Dropdown(value="LoRA Interp",
+                    label="LoRA Interp. or Fix LoRA",
+                    choices=["LoRA Interp", "Fix LoRA A", "Fix LoRA B"],
+                    interactive=True
+                )
+                use_adain = gr.Checkbox(value=True, label="Use AdaIN", interactive=True)
+                use_reschedule = gr.Checkbox(value=True, label="Use Reschedule", interactive=True)
+            with gr.Row():
+                num_frames = gr.Number(value=15, minimum=0, label="Number of Frames", precision=0, interactive=True)
+                fps = gr.Number(value=8, minimum=0, label="FPS (Frame rate)", precision=0, interactive=True)
+                output_path = gr.Text(value="./results", label="Output Path", interactive=True)
+        with gr.Tab("LoRA Settings"):
+            with gr.Row():
+                lora_steps = gr.Number(value=200, label="LoRA training steps", precision=0, interactive=True)
+                lora_lr = gr.Number(value=0.0002, label="LoRA learning rate", interactive=True)
+                lora_rank = gr.Number(value=16, label="LoRA rank", precision=0, interactive=True)
+                # save_lora_dir = gr.Text(value="./lora", label="LoRA model save path", interactive=True)
+                load_lora_path_0 = gr.Text(value="", label="LoRA model load path for image A", interactive=True)
+                load_lora_path_1 = gr.Text(value="", label="LoRA model load path for image B", interactive=True)
+    def store_img(img):
+        """Convert an uploaded numpy image to a 512x512 RGB PIL image for the State slot."""
+        image = Image.fromarray(img).convert("RGB").resize((512,512), Image.BILINEAR)
+        # resize the input to 512x512
+        # image = image.resize((512,512), Image.BILINEAR)
+        # image = np.array(image)
+        # when new image is uploaded, `selected_points` should be empty
+        return image
+    # On upload, refresh the matching State with the preprocessed image.
+    input_img_0.upload(
+        store_img,
+        [input_img_0],
+        [original_image_0]
+    )
+    input_img_1.upload(
+        store_img,
+        [input_img_1],
+        [original_image_1]
+    )
+    def clear(LENGTH):
+        """Return resets for both image widgets plus None for both states and both prompts."""
+        return gr.Image.update(value=None, width=LENGTH, height=LENGTH), \
+            gr.Image.update(value=None, width=LENGTH, height=LENGTH), \
+            None, None, None, None
+    # Hidden Number components are used below to pass constants into callbacks.
+    clear_button.click(
+        clear,
+        [gr.Number(value=LENGTH, visible=False, precision=0)],
+        [input_img_0, input_img_1, original_image_0, original_image_1, prompt_0, prompt_1]
+    )
+    # Train LoRA A: the hidden Number supplies num=0.
+    train_lora_0_button.click(
+        train_lora_interface,
+        [
+         original_image_0,
+         prompt_0,
+         model_path,
+         output_path,
+         lora_steps,
+         lora_rank,
+         lora_lr,
+         gr.Number(value=0, visible=False, precision=0)
+        ],
+        [lora_progress_bar]
+    )
+    # Train LoRA B: the hidden Number supplies num=1.
+    train_lora_1_button.click(
+        train_lora_interface,
+        [
+         original_image_1,
+         prompt_1,
+         model_path,
+         output_path,
+         lora_steps,
+         lora_rank,
+         lora_lr,
+         gr.Number(value=1, visible=False, precision=0)
+        ],
+        [lora_progress_bar]
+    )
+    # Morph only; input order must match run_diffmorpher's positional parameters.
+    run_button.click(
+        run_diffmorpher,
+        [
+         original_image_0,
+         original_image_1,
+         prompt_0,
+         prompt_1,
+         model_path,
+         lora_mode,
+         lamb,
+         use_adain,
+         use_reschedule,
+         num_frames,
+         fps,
+         load_lora_path_0,
+         load_lora_path_1,
+         output_path
+        ],
+        [output_video]
+    )
+    # Train both LoRAs and then morph; input order must match run_all's positional parameters.
+    run_all_button.click(
+        run_all,
+        [
+         original_image_0,
+         original_image_1,
+         prompt_0,
+         prompt_1,
+         model_path,
+         lora_mode,
+         lamb,
+         use_adain,
+         use_reschedule,
+         num_frames,
+         fps,
+         load_lora_path_0,
+         load_lora_path_1,
+         output_path,
+         lora_steps,
+         lora_rank,
+         lora_lr
+        ],
+        [output_video]
+    )
+# Enable the request queue and start the app with debug output.
+demo.queue().launch(debug=True)

lora_utils.py ADDED Viewed

	@@ -0,0 +1,318 @@

+from timeit import default_timer as timer
+from datetime import timedelta
+from PIL import Image
+import os
+import numpy as np
+from einops import rearrange
+import torch
+import torch.nn.functional as F
+from torchvision import transforms
+import transformers
+from accelerate import Accelerator
+from accelerate.utils import set_seed
+from packaging import version
+from PIL import Image
+import tqdm
+from transformers import AutoTokenizer, PretrainedConfig
+import diffusers
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    DiffusionPipeline,
+    DPMSolverMultistepScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
+from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin
+from diffusers.models.attention_processor import (
+    AttnAddedKVProcessor,
+    AttnAddedKVProcessor2_0,
+    LoRAAttnAddedKVProcessor,
+    LoRAAttnProcessor,
+    LoRAAttnProcessor2_0,
+    SlicedAttnAddedKVProcessor,
+)
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version
+from diffusers.utils.import_utils import is_xformers_available
+# Will error if the minimal version of diffusers is not installed. Remove at your own risk.
+check_min_version("0.17.0")
+def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
+    text_encoder_config = PretrainedConfig.from_pretrained(
+        pretrained_model_name_or_path,
+        subfolder="text_encoder",
+        revision=revision,
+    )
+    model_class = text_encoder_config.architectures[0]
+    if model_class == "CLIPTextModel":
+        from transformers import CLIPTextModel
+        return CLIPTextModel
+    elif model_class == "RobertaSeriesModelWithTransformation":
+        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation
+        return RobertaSeriesModelWithTransformation
+    elif model_class == "T5EncoderModel":
+        from transformers import T5EncoderModel
+        return T5EncoderModel
+    else:
+        raise ValueError(f"{model_class} is not supported.")
+def tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None):
+    if tokenizer_max_length is not None:
+        max_length = tokenizer_max_length
+    else:
+        max_length = tokenizer.model_max_length
+    text_inputs = tokenizer(
+        prompt,
+        truncation=True,
+        padding="max_length",
+        max_length=max_length,
+        return_tensors="pt",
+    )
+    return text_inputs
+def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_attention_mask=False):
+    text_input_ids = input_ids.to(text_encoder.device)
+    if text_encoder_use_attention_mask:
+        attention_mask = attention_mask.to(text_encoder.device)
+    else:
+        attention_mask = None
+    prompt_embeds = text_encoder(
+        text_input_ids,
+        attention_mask=attention_mask,
+    )
+    prompt_embeds = prompt_embeds[0]
+    return prompt_embeds
+# model_path: path of the model
+# image: input image, have not been pre-processed
+# save_lora_dir: the path to save the lora
+# prompt: the user input prompt
+# lora_steps: number of lora training step
+# lora_lr: learning rate of lora training
+# lora_rank: the rank of lora
+def train_lora(image, prompt, save_lora_dir, model_path=None, tokenizer=None, text_encoder=None, vae=None, unet=None, noise_scheduler=None, lora_steps=200, lora_lr=2e-4, lora_rank=16, weight_name=None, safe_serialization=False, progress=tqdm):
+    # initialize accelerator
+    accelerator = Accelerator(
+        gradient_accumulation_steps=1,
+        # mixed_precision='fp16'
+    )
+    set_seed(0)
+    # Load the tokenizer
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            subfolder="tokenizer",
+            revision=None,
+            use_fast=False,
+        )
+    # initialize the model
+    if noise_scheduler is None:
+        noise_scheduler = DDPMScheduler.from_pretrained(model_path, subfolder="scheduler")
+    if text_encoder is None:
+        text_encoder_cls = import_model_class_from_model_name_or_path(model_path, revision=None)
+        text_encoder = text_encoder_cls.from_pretrained(
+            model_path, subfolder="text_encoder", revision=None
+        )
+    if vae is None:
+        vae = AutoencoderKL.from_pretrained(
+            model_path, subfolder="vae", revision=None
+        )
+    if unet is None:
+        unet = UNet2DConditionModel.from_pretrained(
+            model_path, subfolder="unet", revision=None
+        )
+    # set device and dtype
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    vae.requires_grad_(False)
+    text_encoder.requires_grad_(False)
+    unet.requires_grad_(False)
+    unet.to(device)
+    vae.to(device)
+    text_encoder.to(device)
+    # initialize UNet LoRA
+    unet_lora_attn_procs = {}
+    for name, attn_processor in unet.attn_processors.items():
+        cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+        if name.startswith("mid_block"):
+            hidden_size = unet.config.block_out_channels[-1]
+        elif name.startswith("up_blocks"):
+            block_id = int(name[len("up_blocks.")])
+            hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+        elif name.startswith("down_blocks"):
+            block_id = int(name[len("down_blocks.")])
+            hidden_size = unet.config.block_out_channels[block_id]
+        else:
+            raise NotImplementedError("name must start with up_blocks, mid_blocks, or down_blocks")
+        if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)):
+            lora_attn_processor_class = LoRAAttnAddedKVProcessor
+        else:
+            lora_attn_processor_class = (
+                LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
+            )
+        unet_lora_attn_procs[name] = lora_attn_processor_class(
+            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=lora_rank
+        )
+    unet.set_attn_processor(unet_lora_attn_procs)
+    unet_lora_layers = AttnProcsLayers(unet.attn_processors)
+    # Optimizer creation
+    params_to_optimize = (unet_lora_layers.parameters())
+    optimizer = torch.optim.AdamW(
+        params_to_optimize,
+        lr=lora_lr,
+        betas=(0.9, 0.999),
+        weight_decay=1e-2,
+        eps=1e-08,
+    )
+    lr_scheduler = get_scheduler(
+        "constant",
+        optimizer=optimizer,
+        num_warmup_steps=0,
+        num_training_steps=lora_steps,
+        num_cycles=1,
+        power=1.0,
+    )
+    # prepare accelerator
+    unet_lora_layers = accelerator.prepare_model(unet_lora_layers)
+    optimizer = accelerator.prepare_optimizer(optimizer)
+    lr_scheduler = accelerator.prepare_scheduler(lr_scheduler)
+    # initialize text embeddings
+    with torch.no_grad():
+        text_inputs = tokenize_prompt(tokenizer, prompt, tokenizer_max_length=None)
+        text_embedding = encode_prompt(
+            text_encoder,
+            text_inputs.input_ids,
+            text_inputs.attention_mask,
+            text_encoder_use_attention_mask=False
+        )
+    if type(image) == np.ndarray:
+        image = Image.fromarray(image)
+    # initialize latent distribution
+    image_transforms = transforms.Compose(
+        [
+            transforms.Resize(512, interpolation=transforms.InterpolationMode.BILINEAR),
+            # transforms.RandomCrop(512),
+            transforms.ToTensor(),
+            transforms.Normalize([0.5], [0.5]),
+        ]
+    )
+    image = image_transforms(image).to(device)
+    image = image.unsqueeze(dim=0)
+    latents_dist = vae.encode(image).latent_dist
+    for _ in progress.tqdm(range(lora_steps), desc="Training LoRA..."):
+        unet.train()
+        model_input = latents_dist.sample() * vae.config.scaling_factor
+        # Sample noise that we'll add to the latents
+        noise = torch.randn_like(model_input)
+        bsz, channels, height, width = model_input.shape
+        # Sample a random timestep for each image
+        timesteps = torch.randint(
+            0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+        )
+        timesteps = timesteps.long()
+        # Add noise to the model input according to the noise magnitude at each timestep
+        # (this is the forward diffusion process)
+        noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps)
+        # Predict the noise residual
+        model_pred = unet(noisy_model_input, timesteps, text_embedding).sample
+        # Get the target for loss depending on the prediction type
+        if noise_scheduler.config.prediction_type == "epsilon":
+            target = noise
+        elif noise_scheduler.config.prediction_type == "v_prediction":
+            target = noise_scheduler.get_velocity(model_input, noise, timesteps)
+        else:
+            raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
+        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
+        accelerator.backward(loss)
+        optimizer.step()
+        lr_scheduler.step()
+        optimizer.zero_grad()
+    # save the trained lora
+    # unet = unet.to(torch.float32)
+    # vae = vae.to(torch.float32)
+    # text_encoder = text_encoder.to(torch.float32)
+    # unwrap_model is used to remove all special modules added when doing distributed training
+    # so here, there is no need to call unwrap_model
+    # unet_lora_layers = accelerator.unwrap_model(unet_lora_layers)
+    LoraLoaderMixin.save_lora_weights(
+        save_directory=save_lora_dir,
+        unet_lora_layers=unet_lora_layers,
+        text_encoder_lora_layers=None,
+        weight_name=weight_name,
+        safe_serialization=safe_serialization
+    )
+def load_lora(unet, lora_0, lora_1, alpha):
+    lora = {}
+    for key in lora_0:
+        lora[key] = (1 - alpha) * lora_0[key] + alpha * lora_1[key]
+    unet.load_attn_procs(lora)
+    return unet
+# import safetensors
+# unet = UNet2DConditionModel.from_pretrained(
+#             "stabilityai/stable-diffusion-2-1-base", subfolder="unet", revision=None
+#         )
+# lora = safetensors.torch.load_file("../models/lora/majicmixRealistic_betterV2V25.safetensors", device="cuda")
+# unet = safetensors.torch.load_file("../stabilityai/stable-diffusion-1-5/v1-5-pruned-emaonly.safetensors", device="cuda")
+# with open("lora.txt", "w") as f:
+#     for key in lora:
+#         f.write(f"{key} {lora[key].shape}\n")
+# with open("unet.txt", "w") as f:
+#     for key in unet:
+#         f.write(f"{key} {unet[key].shape}\n")
+# unet.load_attn_procs(lora)
+# lora_path = "models/lora"
+# image_path_1 = "input/sculpture.jpg"
+# # image_path_0 = "input/realdog0.jpg"
+# prompt = "a photo of a sculpture"
+# train_lora(Image.open(image_path_1), prompt, lora_path, "stabilityai/stable-diffusion-1-5", weight_name="sculpture_v15.safetensors", safe_serialization=True)
+# train_lora(image_path_0, prompt, "stabilityai/stable-diffusion-2-1-base", lora_path, weight_name="realdog0.ckpt")
+# realdog1_lora = torch.load(os.path.join(lora_path, "realdog1.ckpt"))
+# realdog0_lora = torch.load(os.path.join(lora_path, "realdog0.ckpt"))
+# pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float32)
+# pipe.to("cuda")
+# for t in torch.linspace(0, 1, 10):
+#     lora = {}
+#     for key in realdog0_lora:
+#         lora[key] = (1 - t) * realdog1_lora[key] + t * realdog0_lora[key]
+#     pipe.unet.load_attn_procs(lora)
+#     image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0]
+#     image.save(f"test/lora_interp/{t}.jpg")

morph_attn.py ADDED Viewed

	@@ -0,0 +1,827 @@

+import os
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.models.attention_processor import AttnProcessor
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import KarrasDiffusionSchedulers
+import torch
+import torch.nn.functional as F
+import tqdm
+import numpy as np
+import safetensors
+from PIL import Image
+from torchvision import transforms
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
+from lora_utils import train_lora, load_lora
+from diffusers import StableDiffusionPipeline
+from argparse import ArgumentParser
+from alpha_scheduler import AlphaScheduler
+# Command-line options for running this module.
+# NOTE: parse_args() below executes at import time, so importing this module
+# (app.py does) also parses the importing process's sys.argv.
+parser = ArgumentParser()
+# Input images and their prompts.
+parser.add_argument(
+    '--image_path_0', type=str, default='',
+    help='Path of the image to be processed (default: %(default)s)')
+parser.add_argument(
+    '--prompt_0', type=str, default='',
+    help='Prompt of the image (default: %(default)s)')
+parser.add_argument(
+    '--image_path_1', type=str, default='',
+    help='Path of the 2nd image to be processed, used in "morphing" mode (default: %(default)s)')
+parser.add_argument(
+    '--prompt_1', type=str, default='',
+    help='Prompt of the 2nd image, used in "morphing" mode (default: %(default)s)')
+# Output location and sequence length/timing.
+parser.add_argument(
+    '--output_path', type=str, default='',
+    help='Path of the output image (default: %(default)s)'
+)
+parser.add_argument(
+    '--num_frames', type=int, default=50,
+    help='Number of frames to generate (default: %(default)s)'
+)
+parser.add_argument(
+    '--duration', type=int, default=50,
+    help='Duration of each frame (default: %(default)s)'
+)
+# Algorithm switches.
+parser.add_argument(
+    '--use_lora', action='store_true',
+    help='Use LORA to generate images (default: False)'
+)
+parser.add_argument(
+    '--guidance_scale', type=float, default=1.,
+    help='CFG guidace (default: %(default)s)'
+)
+parser.add_argument(
+    '--attn_beta',  type=float, default=None,
+)
+# Declared with a single leading dash, unlike the other options.
+parser.add_argument(
+    '-reschedule',  action='store_true',
+)
+parser.add_argument(
+    '--lamd',  type=float, default=0.6,
+)
+parser.add_argument(
+    '--use_adain', action='store_true'
+)
+args = parser.parse_args()
+# name = args.output_path.split('/')[-1]
+# attn_beta = args.attn_beta
+# num_frames = args.num_frames
+# use_alpha_scheduler = args.reschedule
+# attn_step = 50 * args.lamd
+def calc_mean_std(feat, eps=1e-5):
+    # eps is a small value added to the variance to avoid divide-by-zero.
+    size = feat.size()
+    N, C = size[:2]
+    feat_var = feat.view(N, C, -1).var(dim=2) + eps
+    if len(size) == 3:
+        feat_std = feat_var.sqrt().view(N, C, 1)
+        feat_mean = feat.view(N, C, -1).mean(dim=2).view(N, C, 1)
+    else:
+        feat_std = feat_var.sqrt().view(N, C, 1, 1)
+        feat_mean = feat.view(N, C, -1).mean(dim=2).view(N, C, 1, 1)
+    return feat_mean, feat_std
+def get_img(img, resolution=512):
+    norm_mean = [0.5, 0.5, 0.5]
+    norm_std = [0.5, 0.5, 0.5]
+    transform = transforms.Compose([
+        transforms.Resize((resolution, resolution)),
+        transforms.ToTensor(),
+        transforms.Normalize(norm_mean, norm_std)
+    ])
+    img = transform(img)
+    return img.unsqueeze(0)
+@torch.no_grad()
+def slerp(p0, p1, fract_mixing: float, adain=True):
+    r""" Copied from lunarring/latentblending
+    Helper function to correctly mix two random variables using spherical interpolation.
+    The function will always cast up to float64 for sake of extra 4.
+    Args:
+        p0:
+            First tensor for interpolation
+        p1:
+            Second tensor for interpolation
+        fract_mixing: float
+            Mixing coefficient of interval [0, 1].
+            0 will return in p0
+            1 will return in p1
+            0.x will return a mix between both preserving angular velocity.
+    """
+    if p0.dtype == torch.float16:
+        recast_to = 'fp16'
+    else:
+        recast_to = 'fp32'
+    p0 = p0.double()
+    p1 = p1.double()
+    if adain:
+        mean1, std1 = calc_mean_std(p0)
+        mean2, std2 = calc_mean_std(p1)
+        mean = mean1 * (1 - fract_mixing) + mean2 * fract_mixing
+        std = std1 * (1 - fract_mixing) + std2 * fract_mixing
+    norm = torch.linalg.norm(p0) * torch.linalg.norm(p1)
+    epsilon = 1e-7
+    dot = torch.sum(p0 * p1) / norm
+    dot = dot.clamp(-1+epsilon, 1-epsilon)
+    theta_0 = torch.arccos(dot)
+    sin_theta_0 = torch.sin(theta_0)
+    theta_t = theta_0 * fract_mixing
+    s0 = torch.sin(theta_0 - theta_t) / sin_theta_0
+    s1 = torch.sin(theta_t) / sin_theta_0
+    interp = p0*s0 + p1*s1
+    if adain:
+        interp = F.instance_norm(interp) * std + mean
+    if recast_to == 'fp16':
+        interp = interp.half()
+    elif recast_to == 'fp32':
+        interp = interp.float()
+    return interp
+def do_replace_attn(key: str):
+    # return key.startswith('up_blocks.2') or key.startswith('up_blocks.3')
+    return key.startswith('up')
+class StoreProcessor():
+    def __init__(self, original_processor, value_dict, name):
+        self.original_processor = original_processor
+        self.value_dict = value_dict
+        self.name = name
+        self.value_dict[self.name] = dict()
+        self.id = 0
+    def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
+        # Is self attention
+        if encoder_hidden_states is None:
+            self.value_dict[self.name][self.id] = hidden_states.detach()
+            self.id += 1
+        res = self.original_processor(attn, hidden_states, *args,
+                                      encoder_hidden_states=encoder_hidden_states,
+                                      attention_mask=attention_mask,
+                                      **kwargs)
+        return res
+class LoadProcessor():
+    def __init__(self, original_processor, name, img0_dict, img1_dict, alpha, beta=0, lamb=0.6):
+        super().__init__()
+        self.original_processor = original_processor
+        self.name = name
+        self.img0_dict = img0_dict
+        self.img1_dict = img1_dict
+        self.alpha = alpha
+        self.beta = beta
+        self.lamb = lamb
+        self.id = 0
+    def parent_call(
+        self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0
+    ):
+        """Run attention on ``attn`` with the wrapped processor's LoRA layers added in.
+        Each of the q/k/v/out projections is the base projection from ``attn`` plus
+        ``scale`` times the matching ``*_lora`` layer of ``self.original_processor``.
+        Returns the attended hidden states, restored to the input's layout.
+        """
+        residual = hidden_states
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states)
+        input_ndim = hidden_states.ndim
+        # 4-D input is flattened to (batch, height*width, channel) for attention.
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(
+                batch_size, channel, height * width).transpose(1, 2)
+        # Sequence length comes from the encoder states when they are provided.
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(
+            attention_mask, sequence_length, batch_size)
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(
+                hidden_states.transpose(1, 2)).transpose(1, 2)
+        # Query: base projection plus scaled LoRA projection.
+        query = attn.to_q(hidden_states) + scale * \
+            self.original_processor.to_q_lora(hidden_states)
+        query = attn.head_to_batch_dim(query)
+        # Without encoder states, keys/values are computed from hidden_states itself.
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(
+                encoder_hidden_states)
+        key = attn.to_k(encoder_hidden_states) + scale * \
+            self.original_processor.to_k_lora(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states) + scale * \
+            self.original_processor.to_v_lora(encoder_hidden_states)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+        attention_probs = attn.get_attention_scores(
+            query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # linear proj
+        hidden_states = attn.to_out[0](
+            hidden_states) + scale * self.original_processor.to_out_lora(hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        # Undo the flattening applied to 4-D input above.
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(
+                -1, -2).reshape(batch_size, channel, height, width)
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+        hidden_states = hidden_states / attn.rescale_output_factor
+        return hidden_states
+    def __call__(self, attn, hidden_states, *args, encoder_hidden_states=None, attention_mask=None, **kwargs):
+        # Is self attention
+        if encoder_hidden_states is None:
+            # hardcode timestep
+            if self.id < 50 * self.lamb:
+                map0 = self.img0_dict[self.name][self.id]
+                map1 = self.img1_dict[self.name][self.id]
+                cross_map = self.beta * hidden_states + \
+                    (1 - self.beta) * ((1 - self.alpha) * map0 + self.alpha * map1)
+                # cross_map = self.beta * hidden_states + \
+                #     (1 - self.beta) * slerp(map0, map1, self.alpha)
+                # cross_map = slerp(slerp(map0, map1, self.alpha),
+                #                   hidden_states, self.beta)
+                # cross_map = hidden_states
+                # cross_map = torch.cat(
+                #     ((1 - self.alpha) * map0, self.alpha * map1), dim=1)
+                # res = self.original_processor(attn, hidden_states, *args,
+                #                               encoder_hidden_states=cross_map,
+                #                               attention_mask=attention_mask,
+                #                               temb=temb, **kwargs)
+                res = self.parent_call(attn, hidden_states, *args,
+                                       encoder_hidden_states=cross_map,
+                                       attention_mask=attention_mask,
+                                       **kwargs)
+            else:
+                res = self.original_processor(attn, hidden_states, *args,
+                                              encoder_hidden_states=encoder_hidden_states,
+                                              attention_mask=attention_mask,
+                                              **kwargs)
+            self.id += 1
+            # if self.id == len(self.img0_dict[self.name]):
+            if self.id == len(self.img0_dict[self.name]):
+                self.id = 0
+        else:
+            res = self.original_processor(attn, hidden_states, *args,
+                                          encoder_hidden_states=encoder_hidden_states,
+                                          attention_mask=attention_mask,
+                                          **kwargs)
+        return res
+class DiffMorpherPipeline(StableDiffusionPipeline):
+    def __init__(self,
+                 vae: AutoencoderKL,
+                 text_encoder: CLIPTextModel,
+                 tokenizer: CLIPTokenizer,
+                 unet: UNet2DConditionModel,
+                 scheduler: KarrasDiffusionSchedulers,
+                 safety_checker: StableDiffusionSafetyChecker,
+                 feature_extractor: CLIPImageProcessor,
+                 requires_safety_checker: bool = True,
+                ):
+        super().__init__(vae, text_encoder, tokenizer, unet, scheduler,
+                         safety_checker, feature_extractor, requires_safety_checker)
+        self.img0_dict = dict()
+        self.img1_dict = dict()
+    def inv_step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        x: torch.FloatTensor,
+        eta=0.,
+        verbose=False
+    ):
+        """
+        Inverse sampling for DDIM Inversion
+        """
+        if verbose:
+            print("timestep: ", timestep)
+        next_step = timestep
+        timestep = min(timestep - self.scheduler.config.num_train_timesteps //
+                       self.scheduler.num_inference_steps, 999)
+        alpha_prod_t = self.scheduler.alphas_cumprod[
+            timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
+        alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
+        beta_prod_t = 1 - alpha_prod_t
+        pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+        pred_dir = (1 - alpha_prod_t_next)**0.5 * model_output
+        x_next = alpha_prod_t_next**0.5 * pred_x0 + pred_dir
+        return x_next, pred_x0
+    @torch.no_grad()
+    def invert(
+            self,
+            image: torch.Tensor,
+            prompt,
+            num_inference_steps=50,
+            num_actual_inference_steps=None,
+            guidance_scale=1.,
+            eta=0.0,
+            **kwds):
+        """
+        invert a real image into noise map with determinisc DDIM inversion
+        """
+        DEVICE = torch.device(
+            "cuda") if torch.cuda.is_available() else torch.device("cpu")
+        batch_size = image.shape[0]
+        if isinstance(prompt, list):
+            if batch_size == 1:
+                image = image.expand(len(prompt), -1, -1, -1)
+        elif isinstance(prompt, str):
+            if batch_size > 1:
+                prompt = [prompt] * batch_size
+        # text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            return_tensors="pt"
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
+        print("input text embeddings :", text_embeddings.shape)
+        # define initial latents
+        latents = self.image2latent(image)
+        # unconditional embedding for classifier free guidance
+        if guidance_scale > 1.:
+            max_length = text_input.input_ids.shape[-1]
+            unconditional_input = self.tokenizer(
+                [""] * batch_size,
+                padding="max_length",
+                max_length=77,
+                return_tensors="pt"
+            )
+            unconditional_embeddings = self.text_encoder(
+                unconditional_input.input_ids.to(DEVICE))[0]
+            text_embeddings = torch.cat(
+                [unconditional_embeddings, text_embeddings], dim=0)
+        print("latents shape: ", latents.shape)
+        # interative sampling
+        self.scheduler.set_timesteps(num_inference_steps)
+        print("Valid timesteps: ", reversed(self.scheduler.timesteps))
+        # print("attributes: ", self.scheduler.__dict__)
+        latents_list = [latents]
+        pred_x0_list = [latents]
+        for i, t in enumerate(tqdm.tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
+            if num_actual_inference_steps is not None and i >= num_actual_inference_steps:
+                continue
+            if guidance_scale > 1.:
+                model_inputs = torch.cat([latents] * 2)
+            else:
+                model_inputs = latents
+            # predict the noise
+            noise_pred = self.unet(
+                model_inputs, t, encoder_hidden_states=text_embeddings).sample
+            if guidance_scale > 1.:
+                noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
+                noise_pred = noise_pred_uncon + guidance_scale * \
+                    (noise_pred_con - noise_pred_uncon)
+            # compute the previous noise sample x_t-1 -> x_t
+            latents, pred_x0 = self.inv_step(noise_pred, t, latents)
+            latents_list.append(latents)
+            pred_x0_list.append(pred_x0)
+        return latents
+    @torch.no_grad()
+    def ddim_inversion(self, latent, cond):
+        timesteps = reversed(self.scheduler.timesteps)
+        with torch.autocast(device_type='cuda', dtype=torch.float32):
+            for i, t in enumerate(tqdm.tqdm(timesteps, desc="DDIM inversion")):
+                cond_batch = cond.repeat(latent.shape[0], 1, 1)
+                alpha_prod_t = self.scheduler.alphas_cumprod[t]
+                alpha_prod_t_prev = (
+                    self.scheduler.alphas_cumprod[timesteps[i - 1]]
+                    if i > 0 else self.scheduler.final_alpha_cumprod
+                )
+                mu = alpha_prod_t ** 0.5
+                mu_prev = alpha_prod_t_prev ** 0.5
+                sigma = (1 - alpha_prod_t) ** 0.5
+                sigma_prev = (1 - alpha_prod_t_prev) ** 0.5
+                eps = self.unet(
+                    latent, t, encoder_hidden_states=cond_batch).sample
+                pred_x0 = (latent - sigma_prev * eps) / mu_prev
+                latent = mu * pred_x0 + sigma * eps
+        #         if save_latents:
+        #             torch.save(latent, os.path.join(save_path, f'noisy_latents_{t}.pt'))
+        # torch.save(latent, os.path.join(save_path, f'noisy_latents_{t}.pt'))
+        return latent
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        x: torch.FloatTensor,
+    ):
+        """
+        predict the sample of the next step in the denoise process.
+        """
+        prev_timestep = timestep - \
+            self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
+        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
+        alpha_prod_t_prev = self.scheduler.alphas_cumprod[
+            prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
+        beta_prod_t = 1 - alpha_prod_t
+        pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+        pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
+        x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
+        return x_prev, pred_x0
+    @torch.no_grad()
+    def image2latent(self, image):
+        DEVICE = torch.device(
+            "cuda") if torch.cuda.is_available() else torch.device("cpu")
+        if type(image) is Image:
+            image = np.array(image)
+            image = torch.from_numpy(image).float() / 127.5 - 1
+            image = image.permute(2, 0, 1).unsqueeze(0)
+        # input image density range [-1, 1]
+        latents = self.vae.encode(image.to(DEVICE))['latent_dist'].mean
+        latents = latents * 0.18215
+        return latents
+    @torch.no_grad()
+    def latent2image(self, latents, return_type='np'):
+        latents = 1 / 0.18215 * latents.detach()
+        image = self.vae.decode(latents)['sample']
+        if return_type == 'np':
+            image = (image / 2 + 0.5).clamp(0, 1)
+            image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
+            image = (image * 255).astype(np.uint8)
+        elif return_type == "pt":
+            image = (image / 2 + 0.5).clamp(0, 1)
+        return image
+    def latent2image_grad(self, latents):
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents)['sample']
+        return image  # range [-1, 1]
+    @torch.no_grad()
+    def cal_latent(self, num_inference_steps, guidance_scale, unconditioning, img_noise_0, img_noise_1, text_embeddings_0, text_embeddings_1, lora_0, lora_1, alpha, use_lora, fix_lora=None):
+        # latents = torch.cos(alpha * torch.pi / 2) * img_noise_0 + \
+        #     torch.sin(alpha * torch.pi / 2) * img_noise_1
+        # latents = (1 - alpha) * img_noise_0 + alpha * img_noise_1
+        # latents = latents / ((1 - alpha) ** 2 + alpha ** 2)
+        latents = slerp(img_noise_0, img_noise_1, alpha, self.use_adain)
+        text_embeddings = (1 - alpha) * text_embeddings_0 + \
+            alpha * text_embeddings_1
+        self.scheduler.set_timesteps(num_inference_steps)
+        if use_lora:
+            if fix_lora is not None:
+                self.unet = load_lora(self.unet, lora_0, lora_1, fix_lora)
+            else:
+                self.unet = load_lora(self.unet, lora_0, lora_1, alpha)
+        for i, t in enumerate(tqdm.tqdm(self.scheduler.timesteps, desc=f"DDIM Sampler, alpha={alpha}")):
+            if guidance_scale > 1.:
+                model_inputs = torch.cat([latents] * 2)
+            else:
+                model_inputs = latents
+            if unconditioning is not None and isinstance(unconditioning, list):
+                _, text_embeddings = text_embeddings.chunk(2)
+                text_embeddings = torch.cat(
+                    [unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
+            # predict the noise
+            noise_pred = self.unet(
+                model_inputs, t, encoder_hidden_states=text_embeddings).sample
+            if guidance_scale > 1.0:
+                noise_pred_uncon, noise_pred_con = noise_pred.chunk(
+                    2, dim=0)
+                noise_pred = noise_pred_uncon + guidance_scale * \
+                    (noise_pred_con - noise_pred_uncon)
+            # compute the previous noise sample x_t -> x_t-1
+            # YUJUN: right now, the only difference between step here and step in scheduler
+            # is that scheduler version would clamp pred_x0 between [-1,1]
+            # don't know if that's gonna have huge impact
+            latents = self.scheduler.step(
+                noise_pred, t, latents, return_dict=False)[0]
+        return latents
+    @torch.no_grad()
+    def get_text_embeddings(self, prompt, guidance_scale, neg_prompt, batch_size):
+        DEVICE = torch.device(
+            "cuda") if torch.cuda.is_available() else torch.device("cpu")
+        # text embeddings
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=77,
+            return_tensors="pt"
+        )
+        text_embeddings = self.text_encoder(text_input.input_ids.cuda())[0]
+        if guidance_scale > 1.:
+            if neg_prompt:
+                uc_text = neg_prompt
+            else:
+                uc_text = ""
+            unconditional_input = self.tokenizer(
+                [uc_text] * batch_size,
+                padding="max_length",
+                max_length=77,
+                return_tensors="pt"
+            )
+            unconditional_embeddings = self.text_encoder(
+                unconditional_input.input_ids.to(DEVICE))[0]
+            text_embeddings = torch.cat(
+                [unconditional_embeddings, text_embeddings], dim=0)
+        return text_embeddings
+    def __call__(
+            self,
+            img_0=None,
+            img_1=None,
+            img_path_0=None,
+            img_path_1=None,
+            prompt_0="",
+            prompt_1="",
+            save_lora_dir="./lora",
+            load_lora_path_0=None,
+            load_lora_path_1=None,
+            lora_steps=200,
+            lora_lr=2e-4,
+            lora_rank=16,
+            batch_size=1,
+            height=512,
+            width=512,
+            num_inference_steps=50,
+            num_actual_inference_steps=None,
+            guidance_scale=1,
+            attn_beta=0,
+            lamb=0.6,
+            use_lora = True,
+            use_adain = True,
+            use_reschedule = True,
+            output_path = "./results",
+            num_frames=50,
+            fix_lora=None,
+            progress=tqdm,
+            unconditioning=None,
+            neg_prompt=None,
+            **kwds):
+        """Generate a sequence of frames morphing image 0 into image 1.
+        Args:
+            img_0, img_1: input images; when ``None`` the image is opened from
+                ``img_path_0`` / ``img_path_1`` and converted to RGB.
+            prompt_0, prompt_1: prompts used for LoRA training and for the
+                text embeddings of each image.
+            save_lora_dir: directory of the default LoRA checkpoints.
+            load_lora_path_0, load_lora_path_1: explicit checkpoint paths. When
+                falsy, ``<save_lora_dir>/<last part of output_path>_lora_{0,1}.ckpt``
+                is used and trained first if the file does not exist.
+            lora_steps, lora_lr, lora_rank: forwarded to ``train_lora``.
+            batch_size, neg_prompt: forwarded to ``get_text_embeddings``.
+            num_inference_steps: number of scheduler timesteps.
+            guidance_scale, unconditioning: forwarded to ``cal_latent``.
+            attn_beta: when not ``None`` the attention-replacement path is
+                used; the value is passed to ``LoadProcessor`` as ``beta``.
+            lamb: passed to ``LoadProcessor``.
+            use_lora, use_adain, use_reschedule, output_path: stored on ``self``.
+            num_frames: number of alpha values / frames.
+            fix_lora: when not ``None``, used as the LoRA weight instead of alpha.
+            progress: object exposing ``.tqdm(iterable, desc=...)``.
+            height, width, num_actual_inference_steps, **kwds: accepted but not
+                referenced in this method.
+        Returns:
+            The list of frames, each built with ``Image.fromarray``.
+        """
+        # if isinstance(prompt, list):
+        #     batch_size = len(prompt)
+        # elif isinstance(prompt, str):
+        #     if batch_size > 1:
+        #         prompt = [prompt] * batch_size
+        self.scheduler.set_timesteps(num_inference_steps)
+        self.use_lora = use_lora
+        self.use_adain = use_adain
+        self.use_reschedule = use_reschedule
+        self.output_path = output_path
+        if img_0 is None:
+            img_0 = Image.open(img_path_0).convert("RGB")
+        # else:
+        #     img_0 = Image.fromarray(img_0).convert("RGB")
+        if img_1 is None:
+            img_1 = Image.open(img_path_1).convert("RGB")
+        # else:
+        #     img_1 = Image.fromarray(img_1).convert("RGB")
+        # NOTE(review): lora_0 / lora_1 are only bound inside this branch, yet
+        # morph() below reads them unconditionally - use_lora=False looks like
+        # it would raise a NameError; confirm.
+        if self.use_lora:
+            print("Loading lora...")
+            if not load_lora_path_0:
+                weight_name = f"{output_path.split('/')[-1]}_lora_0.ckpt"
+                load_lora_path_0 = save_lora_dir + "/" + weight_name
+                # Train only when no checkpoint exists at the default path.
+                if not os.path.exists(load_lora_path_0):
+                    train_lora(img_0, prompt_0, save_lora_dir, None, self.tokenizer, self.text_encoder,
+                               self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
+            print(f"Load from {load_lora_path_0}.")
+            if load_lora_path_0.endswith(".safetensors"):
+                lora_0 = safetensors.torch.load_file(
+                    load_lora_path_0, device="cpu")
+            else:
+                # NOTE(review): torch.load unpickles the file - only use with
+                # trusted checkpoints.
+                lora_0 = torch.load(load_lora_path_0, map_location="cpu")
+            if not load_lora_path_1:
+                weight_name = f"{output_path.split('/')[-1]}_lora_1.ckpt"
+                load_lora_path_1 = save_lora_dir + "/" + weight_name
+                if not os.path.exists(load_lora_path_1):
+                    train_lora(img_1, prompt_1, save_lora_dir, None, self.tokenizer, self.text_encoder,
+                               self.vae, self.unet, self.scheduler, lora_steps, lora_lr, lora_rank, weight_name=weight_name)
+            print(f"Load from {load_lora_path_1}.")
+            if load_lora_path_1.endswith(".safetensors"):
+                lora_1 = safetensors.torch.load_file(
+                    load_lora_path_1, device="cpu")
+            else:
+                lora_1 = torch.load(load_lora_path_1, map_location="cpu")
+        text_embeddings_0 = self.get_text_embeddings(
+            prompt_0, guidance_scale, neg_prompt, batch_size)
+        text_embeddings_1 = self.get_text_embeddings(
+            prompt_1, guidance_scale, neg_prompt, batch_size)
+        img_0 = get_img(img_0)
+        img_1 = get_img(img_1)
+        # Each image is inverted with the LoRA blend at its own end (0 or 1).
+        if self.use_lora:
+            self.unet = load_lora(self.unet, lora_0, lora_1, 0)
+        img_noise_0 = self.ddim_inversion(
+            self.image2latent(img_0), text_embeddings_0)
+        if self.use_lora:
+            self.unet = load_lora(self.unet, lora_0, lora_1, 1)
+        img_noise_1 = self.ddim_inversion(
+            self.image2latent(img_1), text_embeddings_1)
+        print("latents shape: ", img_noise_0.shape)
+        def morph(alpha_list, progress, desc, save=False):
+            """Render one frame per entry of ``alpha_list`` and return them.
+            With ``attn_beta`` set, the first and last frames are sampled with
+            ``StoreProcessor`` recording into ``img0_dict`` / ``img1_dict``,
+            and the frames in between use ``LoadProcessor``. Frames are written
+            to ``self.output_path`` only when ``save`` is true.
+            """
+            images = []
+            if attn_beta is not None:
+                # First frame: record self-attention inputs into img0_dict.
+                self.unet = load_lora(self.unet, lora_0, lora_1, 0 if fix_lora is None else fix_lora)
+                attn_processor_dict = {}
+                for k in self.unet.attn_processors.keys():
+                    if do_replace_attn(k):
+                        attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
+                                                                self.img0_dict, k)
+                    else:
+                        attn_processor_dict[k] = self.unet.attn_processors[k]
+                self.unet.set_attn_processor(attn_processor_dict)
+                # use_lora is passed as False: the LoRA was already loaded above.
+                latents = self.cal_latent(
+                    num_inference_steps,
+                    guidance_scale,
+                    unconditioning,
+                    img_noise_0,
+                    img_noise_1,
+                    text_embeddings_0,
+                    text_embeddings_1,
+                    lora_0,
+                    lora_1,
+                    alpha_list[0],
+                    False,
+                    fix_lora
+                )
+                first_image = self.latent2image(latents)
+                first_image = Image.fromarray(first_image)
+                if save:
+                    first_image.save(f"{self.output_path}/{0:02d}.png")
+                # Last frame: record self-attention inputs into img1_dict.
+                self.unet = load_lora(self.unet, lora_0, lora_1, 1 if fix_lora is None else fix_lora)
+                attn_processor_dict = {}
+                for k in self.unet.attn_processors.keys():
+                    if do_replace_attn(k):
+                        attn_processor_dict[k] = StoreProcessor(self.unet.attn_processors[k],
+                                                                self.img1_dict, k)
+                    else:
+                        attn_processor_dict[k] = self.unet.attn_processors[k]
+                self.unet.set_attn_processor(attn_processor_dict)
+                latents = self.cal_latent(
+                    num_inference_steps,
+                    guidance_scale,
+                    unconditioning,
+                    img_noise_0,
+                    img_noise_1,
+                    text_embeddings_0,
+                    text_embeddings_1,
+                    lora_0,
+                    lora_1,
+                    alpha_list[-1],
+                    False,
+                    fix_lora
+                )
+                last_image = self.latent2image(latents)
+                last_image = Image.fromarray(last_image)
+                if save:
+                    last_image.save(
+                        f"{self.output_path}/{num_frames - 1:02d}.png")
+                # In-between frames: blend the recorded features via LoadProcessor.
+                for i in progress.tqdm(range(1, num_frames - 1), desc=desc):
+                    alpha = alpha_list[i]
+                    self.unet = load_lora(self.unet, lora_0, lora_1, alpha if fix_lora is None else fix_lora)
+                    attn_processor_dict = {}
+                    for k in self.unet.attn_processors.keys():
+                        if do_replace_attn(k):
+                            attn_processor_dict[k] = LoadProcessor(
+                                self.unet.attn_processors[k], k, self.img0_dict, self.img1_dict, alpha, attn_beta, lamb)
+                        else:
+                            attn_processor_dict[k] = self.unet.attn_processors[k]
+                    self.unet.set_attn_processor(attn_processor_dict)
+                    latents = self.cal_latent(
+                        num_inference_steps,
+                        guidance_scale,
+                        unconditioning,
+                        img_noise_0,
+                        img_noise_1,
+                        text_embeddings_0,
+                        text_embeddings_1,
+                        lora_0,
+                        lora_1,
+                        alpha_list[i],
+                        False,
+                        fix_lora
+                    )
+                    image = self.latent2image(latents)
+                    image = Image.fromarray(image)
+                    if save:
+                        image.save(f"{self.output_path}/{i:02d}.png")
+                    images.append(image)
+                images = [first_image] + images + [last_image]
+            else:
+                # No attention replacement: every frame is sampled independently
+                # and cal_latent loads the LoRA itself when use_lora is set.
+                for k, alpha in enumerate(alpha_list):
+                    latents = self.cal_latent(
+                        num_inference_steps,
+                        guidance_scale,
+                        unconditioning,
+                        img_noise_0,
+                        img_noise_1,
+                        text_embeddings_0,
+                        text_embeddings_1,
+                        lora_0,
+                        lora_1,
+                        alpha_list[k],
+                        self.use_lora,
+                        fix_lora
+                    )
+                    image = self.latent2image(latents)
+                    image = Image.fromarray(image)
+                    if save:
+                        image.save(f"{self.output_path}/{k:02d}.png")
+                    images.append(image)
+            return images
+        with torch.no_grad():
+            if self.use_reschedule:
+                # Two passes: sample with evenly spaced alphas, derive a new
+                # alpha list from the resulting frames, then sample again.
+                alpha_scheduler = AlphaScheduler()
+                alpha_list = list(torch.linspace(0, 1, num_frames))
+                images_pt = morph(alpha_list, progress, "Sampling...", False)
+                images_pt = [transforms.ToTensor()(img).unsqueeze(0)
+                             for img in images_pt]
+                alpha_scheduler.from_imgs(images_pt)
+                alpha_list = alpha_scheduler.get_list()
+                print(alpha_list)
+                images = morph(alpha_list, progress, "Reschedule...", False)
+            else:
+                alpha_list = list(torch.linspace(0, 1, num_frames))
+                print(alpha_list)
+                images = morph(alpha_list, progress, "Sampling...", False)
+        return images
+# os.makedirs(self.output_path, exist_ok=True)
+# pipeline = DiffMorpherPipeline.from_pretrained(
+#     "./stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float32)
+# pipeline.to("cuda")
+# images = pipeline(
+#     args.image_path_0,
+#     args.image_path_1,
+#     args.prompt_0,
+#     args.prompt_1
+# )
+# images[0].save(f"{self.output_path}/output.gif", save_all=True,
+#                append_images=images[1:], duration=args.duration, loop=0)

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+accelerate==0.23.0
+diffusers==0.17.1
+einops==0.7.0
+# gradio==4.7.1
+numpy==1.26.1
+opencv_python==4.5.5.64
+packaging==23.2
+Pillow==10.1.0
+safetensors==0.4.0
+torch
+torchvision
+tqdm==4.65.0
+transformers==4.34.1
+lpips