ShaoTengLiu committed
Commit 44fa1db
1 Parent(s): ab4a868
Files changed (4)
  1. Video-P2P/run.py +0 -993
  2. Video-P2P/run_tuning.py +30 -5
  3. Video-P2P/run_videop2p.py +106 -69
  4. trainer.py +3 -1
Video-P2P/run.py DELETED
@@ -1,993 +0,0 @@
1
- import argparse
2
- import datetime
3
- import logging
4
- import inspect
5
- import math
6
- import os
7
- from typing import Optional, Union, Tuple, List, Callable, Dict
8
- from omegaconf import OmegaConf
9
-
10
- import torch
11
- import torch.nn.functional as F
12
- import torch.utils.checkpoint
13
-
14
- import diffusers
15
- import transformers
16
- from accelerate import Accelerator
17
- from accelerate.logging import get_logger
18
- from accelerate.utils import set_seed
19
- from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
20
- from diffusers.optimization import get_scheduler
21
- from diffusers.utils import check_min_version
22
- from diffusers.utils.import_utils import is_xformers_available
23
- from tqdm.auto import tqdm
24
- from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer
25
-
26
- from tuneavideo.models.unet import UNet3DConditionModel
27
- from tuneavideo.data.dataset import TuneAVideoDataset
28
- from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
29
- from tuneavideo.util import save_videos_grid, ddim_inversion
30
- from einops import rearrange
31
-
32
- import cv2
33
- import abc
34
- import ptp_utils
35
- import seq_aligner
36
- import shutil
37
- from torch.optim.adam import Adam
38
- from PIL import Image
39
- import numpy as np
40
- import decord
41
- decord.bridge.set_bridge('torch')
42
-
43
- # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
44
- check_min_version("0.10.0.dev0")
45
-
46
- logger = get_logger(__name__, log_level="INFO")
47
-
48
-
49
- def main(
50
- pretrained_model_path: str,
51
- output_dir: str,
52
- train_data: Dict,
53
- validation_data: Dict,
54
- validation_steps: int = 100,
55
- trainable_modules: Tuple[str] = (
56
- "attn1.to_q",
57
- "attn2.to_q",
58
- "attn_temp",
59
- ),
60
- train_batch_size: int = 1,
61
- max_train_steps: int = 500,
62
- learning_rate: float = 3e-5,
63
- scale_lr: bool = False,
64
- lr_scheduler: str = "constant",
65
- lr_warmup_steps: int = 0,
66
- adam_beta1: float = 0.9,
67
- adam_beta2: float = 0.999,
68
- adam_weight_decay: float = 1e-2,
69
- adam_epsilon: float = 1e-08,
70
- max_grad_norm: float = 1.0,
71
- gradient_accumulation_steps: int = 1,
72
- gradient_checkpointing: bool = True,
73
- checkpointing_steps: int = 500,
74
- resume_from_checkpoint: Optional[str] = None,
75
- mixed_precision: Optional[str] = "fp16",
76
- use_8bit_adam: bool = False,
77
- enable_xformers_memory_efficient_attention: bool = True,
78
- seed: Optional[int] = None,
79
- # pretrained_model_path: str,
80
- # image_path: str = None,
81
- # prompt: str = None,
82
- prompts: Tuple[str] = None,
83
- eq_params: Dict = None,
84
- save_name: str = None,
85
- is_word_swap: bool = None,
86
- blend_word: Tuple[str] = None,
87
- cross_replace_steps: float = 0.2,
88
- self_replace_steps: float = 0.5,
89
- video_len: int = 8,
90
- fast: bool = False,
91
- mixed_precision_p2p: str = 'fp32',
92
- ):
93
- *_, config = inspect.getargvalues(inspect.currentframe())
94
-
95
- accelerator = Accelerator(
96
- gradient_accumulation_steps=gradient_accumulation_steps,
97
- mixed_precision=mixed_precision,
98
- )
99
-
100
- # Make one log on every process with the configuration for debugging.
101
- logging.basicConfig(
102
- format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
103
- datefmt="%m/%d/%Y %H:%M:%S",
104
- level=logging.INFO,
105
- )
106
- logger.info(accelerator.state, main_process_only=False)
107
- if accelerator.is_local_main_process:
108
- transformers.utils.logging.set_verbosity_warning()
109
- diffusers.utils.logging.set_verbosity_info()
110
- else:
111
- transformers.utils.logging.set_verbosity_error()
112
- diffusers.utils.logging.set_verbosity_error()
113
-
114
- # If passed along, set the training seed now.
115
- if seed is not None:
116
- set_seed(seed)
117
-
118
- # Handle the output folder creation
119
- if accelerator.is_main_process:
120
- # now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
121
- # output_dir = os.path.join(output_dir, now)
122
- os.makedirs(output_dir, exist_ok=True)
123
- os.makedirs(f"{output_dir}/samples", exist_ok=True)
124
- os.makedirs(f"{output_dir}/inv_latents", exist_ok=True)
125
- OmegaConf.save(config, os.path.join(output_dir, 'config.yaml'))
126
-
127
- # Load scheduler, tokenizer and models.
128
- noise_scheduler = DDPMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
129
- tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
130
- text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder")
131
- vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae")
132
- unet = UNet3DConditionModel.from_pretrained_2d(pretrained_model_path, subfolder="unet")
133
-
134
- # Freeze vae and text_encoder
135
- vae.requires_grad_(False)
136
- text_encoder.requires_grad_(False)
137
-
138
- unet.requires_grad_(False)
139
- for name, module in unet.named_modules():
140
- if name.endswith(tuple(trainable_modules)):
141
- for params in module.parameters():
142
- params.requires_grad = True
143
-
144
- if enable_xformers_memory_efficient_attention:
145
- if is_xformers_available():
146
- unet.enable_xformers_memory_efficient_attention()
147
- else:
148
- raise ValueError("xformers is not available. Make sure it is installed correctly")
149
-
150
- if gradient_checkpointing:
151
- unet.enable_gradient_checkpointing()
152
-
153
- if scale_lr:
154
- learning_rate = (
155
- learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
156
- )
157
-
158
- # Initialize the optimizer
159
- if use_8bit_adam:
160
- try:
161
- import bitsandbytes as bnb
162
- except ImportError:
163
- raise ImportError(
164
- "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
165
- )
166
-
167
- optimizer_cls = bnb.optim.AdamW8bit
168
- else:
169
- optimizer_cls = torch.optim.AdamW
170
-
171
- optimizer = optimizer_cls(
172
- unet.parameters(),
173
- lr=learning_rate,
174
- betas=(adam_beta1, adam_beta2),
175
- weight_decay=adam_weight_decay,
176
- eps=adam_epsilon,
177
- )
178
-
179
- # Get the training dataset
180
- train_dataset = TuneAVideoDataset(**train_data)
181
-
182
- # Preprocessing the dataset
183
- train_dataset.prompt_ids = tokenizer(
184
- train_dataset.prompt, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt"
185
- ).input_ids[0]
186
-
187
- # DataLoaders creation:
188
- train_dataloader = torch.utils.data.DataLoader(
189
- train_dataset, batch_size=train_batch_size
190
- )
191
-
192
- # Get the validation pipeline
193
- validation_pipeline = TuneAVideoPipeline(
194
- vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet,
195
- scheduler=DDIMScheduler.from_pretrained(pretrained_model_path, subfolder="scheduler")
196
- )
197
- validation_pipeline.enable_vae_slicing()
198
- ddim_inv_scheduler = DDIMScheduler.from_pretrained(pretrained_model_path, subfolder='scheduler')
199
- ddim_inv_scheduler.set_timesteps(validation_data.num_inv_steps)
200
-
201
- # Scheduler
202
- lr_scheduler = get_scheduler(
203
- lr_scheduler,
204
- optimizer=optimizer,
205
- num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
206
- num_training_steps=max_train_steps * gradient_accumulation_steps,
207
- )
208
-
209
- # Prepare everything with our `accelerator`.
210
- unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
211
- unet, optimizer, train_dataloader, lr_scheduler
212
- )
213
-
214
- # For mixed precision training we cast the text_encoder and vae weights to half-precision
215
- # as these models are only used for inference, keeping weights in full precision is not required.
216
- weight_dtype = torch.float32
217
- if accelerator.mixed_precision == "fp16":
218
- weight_dtype = torch.float16
219
- elif accelerator.mixed_precision == "bf16":
220
- weight_dtype = torch.bfloat16
221
-
222
- # Move text_encode and vae to gpu and cast to weight_dtype
223
- text_encoder.to(accelerator.device, dtype=weight_dtype)
224
- vae.to(accelerator.device, dtype=weight_dtype)
225
-
226
- # We need to recalculate our total training steps as the size of the training dataloader may have changed.
227
- num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
228
- # Afterwards we recalculate our number of training epochs
229
- num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
230
-
231
- # We need to initialize the trackers we use, and also store our configuration.
232
- # The trackers initializes automatically on the main process.
233
- if accelerator.is_main_process:
234
- accelerator.init_trackers("text2video-fine-tune")
235
-
236
- # Train!
237
- total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
238
-
239
- logger.info("***** Running training *****")
240
- logger.info(f" Num examples = {len(train_dataset)}")
241
- logger.info(f" Num Epochs = {num_train_epochs}")
242
- logger.info(f" Instantaneous batch size per device = {train_batch_size}")
243
- logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
244
- logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
245
- logger.info(f" Total optimization steps = {max_train_steps}")
246
- global_step = 0
247
- first_epoch = 0
248
-
249
- # Potentially load in the weights and states from a previous save
250
- if resume_from_checkpoint:
251
- if resume_from_checkpoint != "latest":
252
- path = os.path.basename(resume_from_checkpoint)
253
- else:
254
- # Get the most recent checkpoint
255
- dirs = os.listdir(output_dir)
256
- dirs = [d for d in dirs if d.startswith("checkpoint")]
257
- dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
258
- path = dirs[-1]
259
- accelerator.print(f"Resuming from checkpoint {path}")
260
- accelerator.load_state(os.path.join(output_dir, path))
261
- global_step = int(path.split("-")[1])
262
-
263
- first_epoch = global_step // num_update_steps_per_epoch
264
- resume_step = global_step % num_update_steps_per_epoch
265
-
266
- # Only show the progress bar once on each machine.
267
- progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
268
- progress_bar.set_description("Steps")
269
-
270
- for epoch in range(first_epoch, num_train_epochs):
271
- unet.train()
272
- train_loss = 0.0
273
- for step, batch in enumerate(train_dataloader):
274
- # Skip steps until we reach the resumed step
275
- if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
276
- if step % gradient_accumulation_steps == 0:
277
- progress_bar.update(1)
278
- continue
279
-
280
- with accelerator.accumulate(unet):
281
- # Convert videos to latent space
282
- pixel_values = batch["pixel_values"].to(weight_dtype)
283
- video_length = pixel_values.shape[1]
284
- pixel_values = rearrange(pixel_values, "b f c h w -> (b f) c h w")
285
- latents = vae.encode(pixel_values).latent_dist.sample()
286
- latents = rearrange(latents, "(b f) c h w -> b c f h w", f=video_length)
287
- latents = latents * 0.18215
288
-
289
- # Sample noise that we'll add to the latents
290
- noise = torch.randn_like(latents)
291
- bsz = latents.shape[0]
292
- # Sample a random timestep for each video
293
- timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device)
294
- timesteps = timesteps.long()
295
-
296
- # Add noise to the latents according to the noise magnitude at each timestep
297
- # (this is the forward diffusion process)
298
- noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
299
-
300
- # Get the text embedding for conditioning
301
- encoder_hidden_states = text_encoder(batch["prompt_ids"])[0]
302
-
303
- # Get the target for loss depending on the prediction type
304
- if noise_scheduler.prediction_type == "epsilon":
305
- target = noise
306
- elif noise_scheduler.prediction_type == "v_prediction":
307
- target = noise_scheduler.get_velocity(latents, noise, timesteps)
308
- else:
309
- raise ValueError(f"Unknown prediction type {noise_scheduler.prediction_type}")
310
-
311
- # Predict the noise residual and compute loss
312
- model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
313
- loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
314
-
315
- # Gather the losses across all processes for logging (if we use distributed training).
316
- avg_loss = accelerator.gather(loss.repeat(train_batch_size)).mean()
317
- train_loss += avg_loss.item() / gradient_accumulation_steps
318
-
319
- # Backpropagate
320
- accelerator.backward(loss)
321
- if accelerator.sync_gradients:
322
- accelerator.clip_grad_norm_(unet.parameters(), max_grad_norm)
323
- optimizer.step()
324
- lr_scheduler.step()
325
- optimizer.zero_grad()
326
-
327
- # Checks if the accelerator has performed an optimization step behind the scenes
328
- if accelerator.sync_gradients:
329
- progress_bar.update(1)
330
- global_step += 1
331
- accelerator.log({"train_loss": train_loss}, step=global_step)
332
- train_loss = 0.0
333
-
334
- if global_step % checkpointing_steps == 0:
335
- if accelerator.is_main_process:
336
- save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
337
- accelerator.save_state(save_path)
338
- logger.info(f"Saved state to {save_path}")
339
-
340
- if global_step % validation_steps == 0:
341
- if accelerator.is_main_process:
342
- samples = []
343
- generator = torch.Generator(device=latents.device)
344
- generator.manual_seed(seed)
345
-
346
- ddim_inv_latent = None
347
- if validation_data.use_inv_latent:
348
- inv_latents_path = os.path.join(output_dir, f"inv_latents/ddim_latent-{global_step}.pt")
349
- ddim_inv_latent = ddim_inversion(
350
- validation_pipeline, ddim_inv_scheduler, video_latent=latents,
351
- num_inv_steps=validation_data.num_inv_steps, prompt="")[-1].to(weight_dtype)
352
- torch.save(ddim_inv_latent, inv_latents_path)
353
-
354
- for idx, prompt in enumerate(validation_data.prompts):
355
- sample = validation_pipeline(prompt, generator=generator, latents=ddim_inv_latent,
356
- **validation_data).videos
357
- save_videos_grid(sample, f"{output_dir}/samples/sample-{global_step}/{prompt}.gif")
358
- samples.append(sample)
359
- samples = torch.concat(samples)
360
- save_path = f"{output_dir}/samples/sample-{global_step}.gif"
361
- save_videos_grid(samples, save_path)
362
- logger.info(f"Saved samples to {save_path}")
363
-
364
- logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
365
- progress_bar.set_postfix(**logs)
366
-
367
- if global_step >= max_train_steps:
368
- break
369
-
370
- # Create the pipeline using the trained modules and save it.
371
- accelerator.wait_for_everyone()
372
- if accelerator.is_main_process:
373
- unet = accelerator.unwrap_model(unet)
374
- pipeline = TuneAVideoPipeline.from_pretrained(
375
- pretrained_model_path,
376
- text_encoder=text_encoder,
377
- vae=vae,
378
- unet=unet,
379
- )
380
- pipeline.save_pretrained(output_dir)
381
-
382
- accelerator.end_training()
383
-
384
- torch.cuda.empty_cache()
385
-
386
- # Video-P2P
387
- scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
388
- MY_TOKEN = ''
389
- LOW_RESOURCE = False
390
- NUM_DDIM_STEPS = 50
391
- GUIDANCE_SCALE = 7.5
392
- MAX_NUM_WORDS = 77
393
- device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
394
-
395
- # need to adjust sometimes
396
- mask_th = (.3, .3)
397
-
398
-
399
- pretrained_model_path = output_dir
400
- image_path = train_data['video_path']
401
- prompt = train_data['prompt']
402
- # prompts = [prompt, ]
403
- output_folder = os.path.join(pretrained_model_path, 'results')
404
- if fast:
405
- save_name_1 = os.path.join(output_folder, 'inversion_fast.gif')
406
- save_name_2 = os.path.join(output_folder, '{}_fast.gif'.format(save_name))
407
- else:
408
- save_name_1 = os.path.join(output_folder, 'inversion.gif')
409
- save_name_2 = os.path.join(output_folder, '{}.gif'.format(save_name))
410
- if blend_word:
411
- blend_word = (((blend_word[0],), (blend_word[1],)))
412
- eq_params = dict(eq_params)
413
- prompts = list(prompts)
414
- cross_replace_steps = {'default_': cross_replace_steps,}
415
-
416
- weight_dtype = torch.float32
417
- if mixed_precision_p2p == "fp16":
418
- weight_dtype = torch.float16
419
- elif mixed_precision_p2p == "bf16":
420
- weight_dtype = torch.bfloat16
421
-
422
- if not os.path.exists(output_folder):
423
- os.makedirs(output_folder)
424
-
425
- # Load the tokenizer
426
- tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_path, subfolder="tokenizer")
427
- # Load models and create wrapper for stable diffusion
428
- text_encoder = CLIPTextModel.from_pretrained(
429
- pretrained_model_path,
430
- subfolder="text_encoder",
431
- ).to(device, dtype=weight_dtype)
432
- vae = AutoencoderKL.from_pretrained(
433
- pretrained_model_path,
434
- subfolder="vae",
435
- ).to(device, dtype=weight_dtype)
436
- unet = UNet3DConditionModel.from_pretrained(
437
- pretrained_model_path, subfolder="unet"
438
- ).to(device)
439
- ldm_stable = TuneAVideoPipeline(
440
- vae=vae,
441
- text_encoder=text_encoder,
442
- tokenizer=tokenizer,
443
- unet=unet,
444
- scheduler=scheduler,
445
- ).to(device)
446
-
447
- try:
448
- ldm_stable.disable_xformers_memory_efficient_attention()
449
- except AttributeError:
450
- print("Attribute disable_xformers_memory_efficient_attention() is missing")
451
- tokenizer = ldm_stable.tokenizer # Tokenizer of class: [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer)
452
- # A tokenizer breaks a stream of text into tokens, usually by looking for whitespace (tabs, spaces, new lines).
453
-
454
- class LocalBlend:
455
-
456
- def get_mask(self, maps, alpha, use_pool):
457
- k = 1
458
- maps = (maps * alpha).sum(-1).mean(2)
459
- if use_pool:
460
- maps = F.max_pool2d(maps, (k * 2 + 1, k * 2 +1), (1, 1), padding=(k, k))
461
- mask = F.interpolate(maps, size=(x_t.shape[3:]))
462
- mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
463
- mask = mask.gt(self.th[1-int(use_pool)])
464
- mask = mask[:1] + mask
465
- return mask
466
-
467
- def __call__(self, x_t, attention_store, step):
468
- self.counter += 1
469
- if self.counter > self.start_blend:
470
- maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
471
- maps = [item.reshape(self.alpha_layers.shape[0], -1, 8, 16, 16, MAX_NUM_WORDS) for item in maps]
472
- maps = torch.cat(maps, dim=2)
473
- mask = self.get_mask(maps, self.alpha_layers, True)
474
- if self.substruct_layers is not None:
475
- maps_sub = ~self.get_mask(maps, self.substruct_layers, False)
476
- mask = mask * maps_sub
477
- mask = mask.float()
478
- mask = mask.reshape(-1, 1, mask.shape[-3], mask.shape[-2], mask.shape[-1])
479
- x_t = x_t[:1] + mask * (x_t - x_t[:1])
480
- return x_t
481
-
482
- def __init__(self, prompts: List[str], words: [List[List[str]]], substruct_words=None, start_blend=0.2, th=(.3, .3)):
483
- alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
484
- for i, (prompt, words_) in enumerate(zip(prompts, words)):
485
- if type(words_) is str:
486
- words_ = [words_]
487
- for word in words_:
488
- ind = ptp_utils.get_word_inds(prompt, word, tokenizer)
489
- alpha_layers[i, :, :, :, :, ind] = 1
490
-
491
- if substruct_words is not None:
492
- substruct_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
493
- for i, (prompt, words_) in enumerate(zip(prompts, substruct_words)):
494
- if type(words_) is str:
495
- words_ = [words_]
496
- for word in words_:
497
- ind = ptp_utils.get_word_inds(prompt, word, tokenizer)
498
- substruct_layers[i, :, :, :, :, ind] = 1
499
- self.substruct_layers = substruct_layers.to(device)
500
- else:
501
- self.substruct_layers = None
502
- self.alpha_layers = alpha_layers.to(device)
503
- self.start_blend = int(start_blend * NUM_DDIM_STEPS)
504
- self.counter = 0
505
- self.th=th
506
-
507
-
508
- class EmptyControl:
509
-
510
-
511
- def step_callback(self, x_t):
512
- return x_t
513
-
514
- def between_steps(self):
515
- return
516
-
517
- def __call__(self, attn, is_cross: bool, place_in_unet: str):
518
- return attn
519
-
520
-
521
- class AttentionControl(abc.ABC):
522
-
523
- def step_callback(self, x_t):
524
- return x_t
525
-
526
- def between_steps(self):
527
- return
528
-
529
- @property
530
- def num_uncond_att_layers(self):
531
- return self.num_att_layers if LOW_RESOURCE else 0
532
-
533
- @abc.abstractmethod
534
- def forward (self, attn, is_cross: bool, place_in_unet: str):
535
- raise NotImplementedError
536
-
537
- def __call__(self, attn, is_cross: bool, place_in_unet: str):
538
- if self.cur_att_layer >= self.num_uncond_att_layers:
539
- if LOW_RESOURCE:
540
- attn = self.forward(attn, is_cross, place_in_unet)
541
- else:
542
- h = attn.shape[0]
543
- attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
544
- self.cur_att_layer += 1
545
- if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
546
- self.cur_att_layer = 0
547
- self.cur_step += 1
548
- self.between_steps()
549
- return attn
550
-
551
- def reset(self):
552
- self.cur_step = 0
553
- self.cur_att_layer = 0
554
-
555
- def __init__(self):
556
- self.cur_step = 0
557
- self.num_att_layers = -1
558
- self.cur_att_layer = 0
559
-
560
- class SpatialReplace(EmptyControl):
561
-
562
- def step_callback(self, x_t):
563
- if self.cur_step < self.stop_inject:
564
- b = x_t.shape[0]
565
- x_t = x_t[:1].expand(b, *x_t.shape[1:])
566
- return x_t
567
-
568
- def __init__(self, stop_inject: float):
569
- super(SpatialReplace, self).__init__()
570
- self.stop_inject = int((1 - stop_inject) * NUM_DDIM_STEPS)
571
-
572
-
573
- class AttentionStore(AttentionControl):
574
-
575
- @staticmethod
576
- def get_empty_store():
577
- return {"down_cross": [], "mid_cross": [], "up_cross": [],
578
- "down_self": [], "mid_self": [], "up_self": []}
579
-
580
- def forward(self, attn, is_cross: bool, place_in_unet: str):
581
- key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
582
- if attn.shape[1] <= 32 ** 2:
583
- self.step_store[key].append(attn)
584
- return attn
585
-
586
- def between_steps(self):
587
- if len(self.attention_store) == 0:
588
- self.attention_store = self.step_store
589
- else:
590
- for key in self.attention_store:
591
- for i in range(len(self.attention_store[key])):
592
- self.attention_store[key][i] += self.step_store[key][i]
593
- self.step_store = self.get_empty_store()
594
-
595
- def get_average_attention(self):
596
- average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store}
597
- return average_attention
598
-
599
-
600
- def reset(self):
601
- super(AttentionStore, self).reset()
602
- self.step_store = self.get_empty_store()
603
- self.attention_store = {}
604
-
605
- def __init__(self):
606
- super(AttentionStore, self).__init__()
607
- self.step_store = self.get_empty_store()
608
- self.attention_store = {}
609
-
610
-
611
- class AttentionControlEdit(AttentionStore, abc.ABC):
612
-
613
- def step_callback(self, x_t):
614
- if self.local_blend is not None:
615
- x_t = self.local_blend(x_t, self.attention_store, self.cur_step)
616
- return x_t
617
-
618
- def replace_self_attention(self, attn_base, att_replace, place_in_unet):
619
- if att_replace.shape[2] <= 32 ** 2:
620
- attn_base = attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
621
- return attn_base
622
- else:
623
- return att_replace
624
-
625
- @abc.abstractmethod
626
- def replace_cross_attention(self, attn_base, att_replace):
627
- raise NotImplementedError
628
-
629
- def forward(self, attn, is_cross: bool, place_in_unet: str):
630
- super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
631
- if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
632
- h = attn.shape[0] // (self.batch_size)
633
- attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
634
- attn_base, attn_repalce = attn[0], attn[1:]
635
- if is_cross:
636
- alpha_words = self.cross_replace_alpha[self.cur_step]
637
- attn_repalce_new = self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + (1 - alpha_words) * attn_repalce
638
- attn[1:] = attn_repalce_new
639
- else:
640
- attn[1:] = self.replace_self_attention(attn_base, attn_repalce, place_in_unet)
641
- attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
642
- return attn
643
-
644
- def __init__(self, prompts, num_steps: int,
645
- cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
646
- self_replace_steps: Union[float, Tuple[float, float]],
647
- local_blend: Optional[LocalBlend]):
648
- super(AttentionControlEdit, self).__init__()
649
- self.batch_size = len(prompts)
650
- self.cross_replace_alpha = ptp_utils.get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps, tokenizer).to(device)
651
- if type(self_replace_steps) is float:
652
- self_replace_steps = 0, self_replace_steps
653
- self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
654
- self.local_blend = local_blend
655
-
656
- class AttentionReplace(AttentionControlEdit):
657
-
658
- def replace_cross_attention(self, attn_base, att_replace):
659
- return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper)
660
-
661
- def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
662
- local_blend: Optional[LocalBlend] = None):
663
- super(AttentionReplace, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
664
- self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).to(device)
665
-
666
-
667
- class AttentionRefine(AttentionControlEdit):
668
-
669
- def replace_cross_attention(self, attn_base, att_replace):
670
- attn_base_replace = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
671
- attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas)
672
- return attn_replace
673
-
674
- def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
675
- local_blend: Optional[LocalBlend] = None):
676
- super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
677
- self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer)
678
- self.mapper, alphas = self.mapper.to(device), alphas.to(device)
679
- self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
680
-
681
-
682
- class AttentionReweight(AttentionControlEdit):
683
-
684
- def replace_cross_attention(self, attn_base, att_replace):
685
- if self.prev_controller is not None:
686
- attn_base = self.prev_controller.replace_cross_attention(attn_base, att_replace)
687
- attn_replace = attn_base[None, :, :, :] * self.equalizer[:, None, None, :]
688
- return attn_replace
689
-
690
- def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float, equalizer,
691
- local_blend: Optional[LocalBlend] = None, controller: Optional[AttentionControlEdit] = None):
692
- super(AttentionReweight, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
693
- self.equalizer = equalizer.to(device)
694
- self.prev_controller = controller
695
-
696
-
697
- def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float],
698
- Tuple[float, ...]]):
699
- if type(word_select) is int or type(word_select) is str:
700
- word_select = (word_select,)
701
- equalizer = torch.ones(1, 77)
702
-
703
- for word, val in zip(word_select, values):
704
- inds = ptp_utils.get_word_inds(text, word, tokenizer)
705
- equalizer[:, inds] = val
706
- return equalizer
707
-
708
- def aggregate_attention(attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int):
709
- out = []
710
- attention_maps = attention_store.get_average_attention()
711
- num_pixels = res ** 2
712
- for location in from_where:
713
- for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
714
- if item.shape[1] == num_pixels:
715
- cross_maps = item.reshape(8, 8, res, res, item.shape[-1])
716
- out.append(cross_maps)
717
- out = torch.cat(out, dim=1)
718
- out = out.sum(1) / out.shape[1]
719
- return out.cpu()
720
-
721
-
722
- def make_controller(prompts: List[str], is_replace_controller: bool, cross_replace_steps: Dict[str, float], self_replace_steps: float, blend_words=None, equilizer_params=None, mask_th=(.3,.3)) -> AttentionControlEdit:
723
- if blend_words is None:
724
- lb = None
725
- else:
726
- lb = LocalBlend(prompts, blend_word, th=mask_th)
727
- if is_replace_controller:
728
- controller = AttentionReplace(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps, self_replace_steps=self_replace_steps, local_blend=lb)
729
- else:
730
- controller = AttentionRefine(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps, self_replace_steps=self_replace_steps, local_blend=lb)
731
- if equilizer_params is not None:
732
- eq = get_equalizer(prompts[1], equilizer_params["words"], equilizer_params["values"])
733
- controller = AttentionReweight(prompts, NUM_DDIM_STEPS, cross_replace_steps=cross_replace_steps,
734
- self_replace_steps=self_replace_steps, equalizer=eq, local_blend=lb, controller=controller)
735
- return controller
736
-
737
-
738
- def load_512_seq(image_path, left=0, right=0, top=0, bottom=0, n_sample_frame=video_len, sampling_rate=1):
739
- vr = decord.VideoReader(image_path, width=512, height=512)
740
- sample_index = list(range(0, len(vr), sampling_rate))[:n_sample_frame]
741
- video = vr.get_batch(sample_index)
742
- return video.numpy()
743
-
744
-
745
- class NullInversion:
746
-
747
- def prev_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray]):
748
- prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
749
- alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
750
- alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.scheduler.final_alpha_cumprod
751
- beta_prod_t = 1 - alpha_prod_t
752
- pred_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
753
- pred_sample_direction = (1 - alpha_prod_t_prev) ** 0.5 * model_output
754
- prev_sample = alpha_prod_t_prev ** 0.5 * pred_original_sample + pred_sample_direction
755
- return prev_sample
756
-
757
- def next_step(self, model_output: Union[torch.FloatTensor, np.ndarray], timestep: int, sample: Union[torch.FloatTensor, np.ndarray]):
758
- timestep, next_timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999), timestep
759
- alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
760
- alpha_prod_t_next = self.scheduler.alphas_cumprod[next_timestep]
761
- beta_prod_t = 1 - alpha_prod_t
762
- next_original_sample = (sample - beta_prod_t ** 0.5 * model_output) / alpha_prod_t ** 0.5
763
- next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
764
- next_sample = alpha_prod_t_next ** 0.5 * next_original_sample + next_sample_direction
765
- return next_sample
766
-
767
- def get_noise_pred_single(self, latents, t, context):
768
- noise_pred = self.model.unet(latents, t, encoder_hidden_states=context)["sample"]
769
- return noise_pred
770
-
771
- def get_noise_pred(self, latents, t, is_forward=True, context=None):
772
- latents_input = torch.cat([latents] * 2)
773
- if context is None:
774
- context = self.context
775
- guidance_scale = 1 if is_forward else GUIDANCE_SCALE
776
- noise_pred = self.model.unet(latents_input, t, encoder_hidden_states=context)["sample"]
777
- noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2)
778
- noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond)
779
- if is_forward:
780
- latents = self.next_step(noise_pred, t, latents)
781
- else:
782
- latents = self.prev_step(noise_pred, t, latents)
783
- return latents
784
-
785
- @torch.no_grad()
786
- def latent2image(self, latents, return_type='np'):
787
- latents = 1 / 0.18215 * latents.detach()
788
- image = self.model.vae.decode(latents)['sample']
789
- if return_type == 'np':
790
- image = (image / 2 + 0.5).clamp(0, 1)
791
- image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
792
- image = (image * 255).astype(np.uint8)
793
- return image
794
-
795
- @torch.no_grad()
796
- def latent2image_video(self, latents, return_type='np'):
797
- latents = 1 / 0.18215 * latents.detach()
798
- latents = latents[0].permute(1, 0, 2, 3)
799
- image = self.model.vae.decode(latents)['sample']
800
- if return_type == 'np':
801
- image = (image / 2 + 0.5).clamp(0, 1)
802
- image = image.cpu().permute(0, 2, 3, 1).numpy()
803
- image = (image * 255).astype(np.uint8)
804
- return image
805
-
806
- @torch.no_grad()
807
- def image2latent(self, image):
808
- with torch.no_grad():
809
- if type(image) is Image:
810
- image = np.array(image)
811
- if type(image) is torch.Tensor and image.dim() == 4:
812
- latents = image
813
- else:
814
- image = torch.from_numpy(image).float() / 127.5 - 1
815
- image = image.permute(2, 0, 1).unsqueeze(0).to(device, dtype=weight_dtype)
816
- latents = self.model.vae.encode(image)['latent_dist'].mean
817
- latents = latents * 0.18215
818
- return latents
819
-
820
- @torch.no_grad()
821
- def image2latent_video(self, image):
822
- with torch.no_grad():
823
- image = torch.from_numpy(image).float() / 127.5 - 1
824
- image = image.permute(0, 3, 1, 2).to(device).to(device, dtype=weight_dtype)
825
- latents = self.model.vae.encode(image)['latent_dist'].mean
826
- latents = rearrange(latents, "(b f) c h w -> b c f h w", b=1)
827
- latents = latents * 0.18215
828
- return latents
829
-
830
- @torch.no_grad()
831
- def init_prompt(self, prompt: str):
832
- uncond_input = self.model.tokenizer(
833
- [""], padding="max_length", max_length=self.model.tokenizer.model_max_length,
834
- return_tensors="pt"
835
- )
836
- uncond_embeddings = self.model.text_encoder(uncond_input.input_ids.to(self.model.device))[0]
837
- text_input = self.model.tokenizer(
838
- [prompt],
839
- padding="max_length",
840
- max_length=self.model.tokenizer.model_max_length,
841
- truncation=True,
842
- return_tensors="pt",
843
- )
844
- text_embeddings = self.model.text_encoder(text_input.input_ids.to(self.model.device))[0]
845
- self.context = torch.cat([uncond_embeddings, text_embeddings])
846
- self.prompt = prompt
847
-
848
- @torch.no_grad()
849
- def ddim_loop(self, latent):
850
- uncond_embeddings, cond_embeddings = self.context.chunk(2)
851
- all_latent = [latent]
852
- latent = latent.clone().detach()
853
- for i in range(NUM_DDIM_STEPS):
854
- t = self.model.scheduler.timesteps[len(self.model.scheduler.timesteps) - i - 1]
855
- noise_pred = self.get_noise_pred_single(latent, t, cond_embeddings)
856
- latent = self.next_step(noise_pred, t, latent)
857
- all_latent.append(latent)
858
- return all_latent
859
-
860
- @property
861
- def scheduler(self):
862
- return self.model.scheduler
863
-
864
- @torch.no_grad()
865
- def ddim_inversion(self, image):
866
- latent = self.image2latent_video(image)
867
- image_rec = self.latent2image_video(latent)
868
- ddim_latents = self.ddim_loop(latent)
869
- return image_rec, ddim_latents
870
-
871
- def null_optimization(self, latents, num_inner_steps, epsilon):
872
- uncond_embeddings, cond_embeddings = self.context.chunk(2)
873
- uncond_embeddings_list = []
874
- latent_cur = latents[-1]
875
- # bar = tqdm(total=num_inner_steps * NUM_DDIM_STEPS)
876
- for i in range(NUM_DDIM_STEPS):
877
- uncond_embeddings = uncond_embeddings.clone().detach()
878
- uncond_embeddings.requires_grad = True
879
- optimizer = Adam([uncond_embeddings], lr=1e-2 * (1. - i / 100.))
880
- latent_prev = latents[len(latents) - i - 2]
881
- t = self.model.scheduler.timesteps[i]
882
- with torch.no_grad():
883
- noise_pred_cond = self.get_noise_pred_single(latent_cur, t, cond_embeddings)
884
- for j in range(num_inner_steps):
885
- noise_pred_uncond = self.get_noise_pred_single(latent_cur, t, uncond_embeddings)
886
- noise_pred = noise_pred_uncond + GUIDANCE_SCALE * (noise_pred_cond - noise_pred_uncond)
887
- latents_prev_rec = self.prev_step(noise_pred, t, latent_cur)
888
- loss = F.mse_loss(latents_prev_rec, latent_prev)
889
- optimizer.zero_grad()
890
- loss.backward()
891
- optimizer.step()
892
- loss_item = loss.item()
893
- # bar.update()
894
- if loss_item < epsilon + i * 2e-5:
895
- break
896
- # for j in range(j + 1, num_inner_steps):
897
- # bar.update()
898
- uncond_embeddings_list.append(uncond_embeddings[:1].detach())
899
- with torch.no_grad():
900
- context = torch.cat([uncond_embeddings, cond_embeddings])
901
- latent_cur = self.get_noise_pred(latent_cur, t, False, context)
902
- # bar.close()
903
- return uncond_embeddings_list
904
-
905
- def invert(self, image_path: str, prompt: str, offsets=(0,0,0,0), num_inner_steps=10, early_stop_epsilon=1e-5, verbose=False):
906
- self.init_prompt(prompt)
907
- ptp_utils.register_attention_control(self.model, None)
908
- image_gt = load_512_seq(image_path, *offsets)
909
- if verbose:
910
- print("DDIM inversion...")
911
- image_rec, ddim_latents = self.ddim_inversion(image_gt)
912
- if verbose:
913
- print("Null-text optimization...")
914
- uncond_embeddings = self.null_optimization(ddim_latents, num_inner_steps, early_stop_epsilon)
915
- return (image_gt, image_rec), ddim_latents[-1], uncond_embeddings
916
-
917
- def invert_(self, image_path: str, prompt: str, offsets=(0,0,0,0), num_inner_steps=10, early_stop_epsilon=1e-5, verbose=False):
918
- self.init_prompt(prompt)
919
- ptp_utils.register_attention_control(self.model, None)
920
- image_gt = load_512_seq(image_path, *offsets)
921
- if verbose:
922
- print("DDIM inversion...")
923
- image_rec, ddim_latents = self.ddim_inversion(image_gt)
924
- if verbose:
925
- print("Null-text optimization...")
926
- return (image_gt, image_rec), ddim_latents[-1], None
927
-
928
- def __init__(self, model):
929
- scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False,
930
- set_alpha_to_one=False)
931
- self.model = model
932
- self.tokenizer = self.model.tokenizer
933
- self.model.scheduler.set_timesteps(NUM_DDIM_STEPS)
934
- self.prompt = None
935
- self.context = None
936
-
937
- null_inversion = NullInversion(ldm_stable)
938
-
939
- ###############
940
- # Custom APIs:
941
-
942
- ldm_stable.enable_xformers_memory_efficient_attention()
943
-
944
- if fast:
945
- (image_gt, image_enc), x_t, uncond_embeddings = null_inversion.invert_(image_path, prompt, offsets=(0,0,0,0), verbose=True)
946
- else:
947
- (image_gt, image_enc), x_t, uncond_embeddings = null_inversion.invert(image_path, prompt, offsets=(0,0,0,0), verbose=True)
948
-
949
- ##### load uncond #####
950
- # uncond_embeddings_load = np.load(uncond_embeddings_path)
951
- # uncond_embeddings = []
952
- # for i in range(uncond_embeddings_load.shape[0]):
953
- # uncond_embeddings.append(torch.from_numpy(uncond_embeddings_load[i]).to(device))
954
- #######################
955
-
956
- ##### save uncond #####
957
- # uncond_embeddings = torch.cat(uncond_embeddings)
958
- # uncond_embeddings = uncond_embeddings.cpu().numpy()
959
- #######################
960
-
961
- print("Start Video-P2P!")
962
- controller = make_controller(prompts, is_word_swap, cross_replace_steps, self_replace_steps, blend_word, eq_params, mask_th=mask_th)
963
- ptp_utils.register_attention_control(ldm_stable, controller)
964
- generator = torch.Generator(device=device)
965
- with torch.no_grad():
966
- sequence = ldm_stable(
967
- prompts,
968
- generator=generator,
969
- latents=x_t,
970
- uncond_embeddings_pre=uncond_embeddings,
971
- controller = controller,
972
- video_length=video_len,
973
- fast=fast,
974
- ).videos
975
- sequence1 = rearrange(sequence[0], "c t h w -> t h w c")
976
- sequence2 = rearrange(sequence[1], "c t h w -> t h w c")
977
- inversion = []
978
- videop2p = []
979
- for i in range(sequence1.shape[0]):
980
- inversion.append( Image.fromarray((sequence1[i] * 255).numpy().astype(np.uint8)) )
981
- videop2p.append( Image.fromarray((sequence2[i] * 255).numpy().astype(np.uint8)) )
982
-
983
- # inversion[0].save(save_name_1, save_all=True, append_images=inversion[1:], optimize=False, loop=0, duration=250)
984
- videop2p[0].save(save_name_2, save_all=True, append_images=videop2p[1:], optimize=False, loop=0, duration=250)
985
-
986
-
987
- if __name__ == "__main__":
988
- parser = argparse.ArgumentParser()
989
- parser.add_argument("--config", type=str, default="./configs/tuneavideo.yaml")
990
- parser.add_argument("--fast", action='store_true')
991
- args = parser.parse_args()
992
-
993
- main(**OmegaConf.load(args.config), fast=args.fast)
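The deleted run.py above already reads frames with decord rather than per-frame PIL loads. A minimal, self-contained sketch of that sampling logic, assuming a short local clip at ./data/sample.mp4 (a hypothetical path, not a file from this repository):

# Sketch of the decord-based frame sampling mirrored from load_512_seq above.
# The video path is a placeholder.
import decord
decord.bridge.set_bridge('torch')

def load_frames(video_path, n_sample_frame=8, sampling_rate=1):
    # Decode straight to 512x512, the resolution the pipeline expects.
    vr = decord.VideoReader(video_path, width=512, height=512)
    sample_index = list(range(0, len(vr), sampling_rate))[:n_sample_frame]
    video = vr.get_batch(sample_index)  # (f, h, w, c) uint8 tensor via the torch bridge
    return video.numpy()

frames = load_frames('./data/sample.mp4')
print(frames.shape)  # expected (8, 512, 512, 3)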
Video-P2P/run_tuning.py CHANGED
@@ -1,12 +1,10 @@
-# From https://github.com/showlab/Tune-A-Video/blob/main/train_tuneavideo.py
-
 import argparse
 import datetime
 import logging
 import inspect
 import math
 import os
-from typing import Dict, Optional, Tuple
+from typing import Optional, Union, Tuple, List, Callable, Dict
 from omegaconf import OmegaConf
 
 import torch
@@ -23,7 +21,7 @@ from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version
 from diffusers.utils.import_utils import is_xformers_available
 from tqdm.auto import tqdm
-from transformers import CLIPTextModel, CLIPTokenizer
+from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer
 
 from tuneavideo.models.unet import UNet3DConditionModel
 from tuneavideo.data.dataset import TuneAVideoDataset
@@ -31,6 +29,16 @@ from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
 from tuneavideo.util import save_videos_grid, ddim_inversion
 from einops import rearrange
 
+import cv2
+import abc
+import ptp_utils
+import seq_aligner
+import shutil
+from torch.optim.adam import Adam
+from PIL import Image
+import numpy as np
+import decord
+decord.bridge.set_bridge('torch')
 
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
 check_min_version("0.10.0.dev0")
@@ -68,6 +76,19 @@ def main(
     use_8bit_adam: bool = False,
     enable_xformers_memory_efficient_attention: bool = True,
     seed: Optional[int] = None,
+    # pretrained_model_path: str,
+    # image_path: str = None,
+    # prompt: str = None,
+    prompts: Tuple[str] = None,
+    eq_params: Dict = None,
+    save_name: str = None,
+    is_word_swap: bool = None,
+    blend_word: Tuple[str] = None,
+    cross_replace_steps: float = 0.2,
+    self_replace_steps: float = 0.5,
+    video_len: int = 8,
+    fast: bool = False,
+    mixed_precision_p2p: str = 'fp32',
 ):
     *_, config = inspect.getargvalues(inspect.currentframe())
 
@@ -96,6 +117,8 @@ def main(
 
     # Handle the output folder creation
     if accelerator.is_main_process:
+        # now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
+        # output_dir = os.path.join(output_dir, now)
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(f"{output_dir}/samples", exist_ok=True)
        os.makedirs(f"{output_dir}/inv_latents", exist_ok=True)
@@ -358,10 +381,12 @@ def main(
 
    accelerator.end_training()
 
+    torch.cuda.empty_cache()
 
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, default="./configs/tuneavideo.yaml")
+    parser.add_argument("--fast", action='store_true')
    args = parser.parse_args()
 
-    main(**OmegaConf.load(args.config))
+    main(**OmegaConf.load(args.config), fast=args.fast)
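run_tuning.py now unpacks the whole OmegaConf config into main() and forwards the new --fast flag. Below is a sketch of a minimal config that satisfies the signature above; the keys come from the parameter list in the diff, but every value is illustrative rather than taken from the repository's YAML files:

# Hypothetical minimal config for main(); values are placeholders.
from omegaconf import OmegaConf

config = OmegaConf.create({
    "pretrained_model_path": "checkpoints/stable-diffusion-v1-5",
    "output_dir": "outputs/demo",
    # A real train_data block also carries the TuneAVideoDataset arguments.
    "train_data": {"video_path": "data/demo.mp4", "prompt": "a man is skiing"},
    "validation_data": {"prompts": ["a man is skiing"], "num_inv_steps": 50,
                        "use_inv_latent": True},
    "max_train_steps": 300,
    # Prompt-to-Prompt options consumed by the editing stage:
    "prompts": ["a man is skiing", "a Spider-Man is skiing"],
    "blend_word": ["man", "man"],
    "is_word_swap": True,
    "save_name": "spider-man",
})
# main(**config, fast=True)  # same call pattern as the __main__ block above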
Video-P2P/run_videop2p.py CHANGED
@@ -1,54 +1,113 @@
-# Adapted from https://github.com/google/prompt-to-prompt/blob/main/null_text_w_ptp.ipynb
-
+import argparse
+import datetime
+import logging
+import inspect
+import math
 import os
 from typing import Optional, Union, Tuple, List, Callable, Dict
-from tqdm.notebook import tqdm
+from omegaconf import OmegaConf
+
 import torch
-from diffusers import StableDiffusionPipeline, DDIMScheduler, AutoencoderKL
-import torch.nn.functional as nnf
-import numpy as np
+import torch.nn.functional as F
+import torch.utils.checkpoint
+
+import diffusers
+import transformers
+from accelerate import Accelerator
+from accelerate.logging import get_logger
+from accelerate.utils import set_seed
+from diffusers import AutoencoderKL, DDPMScheduler, DDIMScheduler
+from diffusers.optimization import get_scheduler
+from diffusers.utils import check_min_version
+from diffusers.utils.import_utils import is_xformers_available
+from tqdm.auto import tqdm
+from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer
+
+from tuneavideo.models.unet import UNet3DConditionModel
+from tuneavideo.data.dataset import TuneAVideoDataset
+from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
+from tuneavideo.util import save_videos_grid, ddim_inversion
+from einops import rearrange
+
+import cv2
 import abc
 import ptp_utils
 import seq_aligner
 import shutil
 from torch.optim.adam import Adam
 from PIL import Image
-from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer
-from einops import rearrange
+import numpy as np
+import decord
+decord.bridge.set_bridge('torch')
 
-from tuneavideo.models.unet import UNet3DConditionModel
-from tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
+# Will error if the minimal version of diffusers is not installed. Remove at your own risks.
+check_min_version("0.10.0.dev0")
 
-import cv2
-import argparse
-from omegaconf import OmegaConf
+logger = get_logger(__name__, log_level="INFO")
 
-scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
-MY_TOKEN = ''
-LOW_RESOURCE = False
-NUM_DDIM_STEPS = 50
-GUIDANCE_SCALE = 7.5
-MAX_NUM_WORDS = 77
-device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
-
-# need to adjust sometimes
-mask_th = (.3, .3)
 
 def main(
     pretrained_model_path: str,
-    image_path: str,
-    prompt: str,
-    prompts: Tuple[str],
-    eq_params: Dict,
-    save_name: str,
-    is_word_swap: bool,
+    output_dir: str,
+    train_data: Dict,
+    validation_data: Dict,
+    validation_steps: int = 100,
+    trainable_modules: Tuple[str] = (
+        "attn1.to_q",
+        "attn2.to_q",
+        "attn_temp",
+    ),
+    train_batch_size: int = 1,
+    max_train_steps: int = 500,
+    learning_rate: float = 3e-5,
+    scale_lr: bool = False,
+    lr_scheduler: str = "constant",
+    lr_warmup_steps: int = 0,
+    adam_beta1: float = 0.9,
+    adam_beta2: float = 0.999,
+    adam_weight_decay: float = 1e-2,
+    adam_epsilon: float = 1e-08,
+    max_grad_norm: float = 1.0,
+    gradient_accumulation_steps: int = 1,
+    gradient_checkpointing: bool = True,
+    checkpointing_steps: int = 500,
+    resume_from_checkpoint: Optional[str] = None,
+    mixed_precision: Optional[str] = "fp16",
+    use_8bit_adam: bool = False,
+    enable_xformers_memory_efficient_attention: bool = True,
+    seed: Optional[int] = None,
+    # pretrained_model_path: str,
+    # image_path: str = None,
+    # prompt: str = None,
+    prompts: Tuple[str] = None,
+    eq_params: Dict = None,
+    save_name: str = None,
+    is_word_swap: bool = None,
     blend_word: Tuple[str] = None,
     cross_replace_steps: float = 0.2,
     self_replace_steps: float = 0.5,
     video_len: int = 8,
     fast: bool = False,
-    mixed_precision: str = 'fp32',
+    mixed_precision_p2p: str = 'fp32',
 ):
+
+    # Video-P2P
+    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
+    MY_TOKEN = ''
+    LOW_RESOURCE = False
+    NUM_DDIM_STEPS = 50
+    GUIDANCE_SCALE = 7.5
+    MAX_NUM_WORDS = 77
+    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+
+    # need to adjust sometimes
+    mask_th = (.3, .3)
+
+
+    pretrained_model_path = output_dir
+    image_path = train_data['video_path']
+    prompt = train_data['prompt']
+    # prompts = [prompt, ]
     output_folder = os.path.join(pretrained_model_path, 'results')
     if fast:
         save_name_1 = os.path.join(output_folder, 'inversion_fast.gif')
@@ -63,9 +122,9 @@ def main(
     cross_replace_steps = {'default_': cross_replace_steps,}
 
     weight_dtype = torch.float32
-    if mixed_precision == "fp16":
+    if mixed_precision_p2p == "fp16":
         weight_dtype = torch.float16
-    elif mixed_precision == "bf16":
+    elif mixed_precision_p2p == "bf16":
         weight_dtype = torch.bfloat16
 
     if not os.path.exists(output_folder):
@@ -106,8 +165,8 @@ def main(
             k = 1
             maps = (maps * alpha).sum(-1).mean(2)
             if use_pool:
-                maps = nnf.max_pool2d(maps, (k * 2 + 1, k * 2 +1), (1, 1), padding=(k, k))
-            mask = nnf.interpolate(maps, size=(x_t.shape[3:]))
+                maps = F.max_pool2d(maps, (k * 2 + 1, k * 2 +1), (1, 1), padding=(k, k))
+            mask = F.interpolate(maps, size=(x_t.shape[3:]))
             mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
             mask = mask.gt(self.th[1-int(use_pool)])
             mask = mask[:1] + mask
@@ -385,33 +444,10 @@ def main(
 
 
    def load_512_seq(image_path, left=0, right=0, top=0, bottom=0, n_sample_frame=video_len, sampling_rate=1):
-        images = []
-        for file in sorted(os.listdir(image_path)):
-            images.append(file)
-        n_images = len(images)
-        sequence_length = (n_sample_frame - 1) * sampling_rate + 1
-        if n_images < sequence_length:
-            raise ValueError
-        frames = []
-        for index in range(n_sample_frame):
-            p = os.path.join(image_path, images[index])
-            image = np.array(Image.open(p).convert("RGB"))
-            h, w, c = image.shape
-            left = min(left, w-1)
-            right = min(right, w - left - 1)
-            top = min(top, h - left - 1)
-            bottom = min(bottom, h - top - 1)
-            image = image[top:h-bottom, left:w-right]
-            h, w, c = image.shape
-            if h < w:
-                offset = (w - h) // 2
-                image = image[:, offset:offset + h]
-            elif w < h:
-                offset = (h - w) // 2
-                image = image[offset:offset + w]
-            image = np.array(Image.fromarray(image).resize((512, 512)))
-            frames.append(image)
-        return np.stack(frames)
+        vr = decord.VideoReader(image_path, width=512, height=512)
+        sample_index = list(range(0, len(vr), sampling_rate))[:n_sample_frame]
+        video = vr.get_batch(sample_index)
+        return video.numpy()
 
 
    class NullInversion:
@@ -544,7 +580,7 @@ def main(
            uncond_embeddings, cond_embeddings = self.context.chunk(2)
            uncond_embeddings_list = []
            latent_cur = latents[-1]
-            bar = tqdm(total=num_inner_steps * NUM_DDIM_STEPS)
+            # bar = tqdm(total=num_inner_steps * NUM_DDIM_STEPS)
            for i in range(NUM_DDIM_STEPS):
                uncond_embeddings = uncond_embeddings.clone().detach()
                uncond_embeddings.requires_grad = True
@@ -557,21 +593,21 @@ def main(
                    noise_pred_uncond = self.get_noise_pred_single(latent_cur, t, uncond_embeddings)
                    noise_pred = noise_pred_uncond + GUIDANCE_SCALE * (noise_pred_cond - noise_pred_uncond)
                    latents_prev_rec = self.prev_step(noise_pred, t, latent_cur)
-                    loss = nnf.mse_loss(latents_prev_rec, latent_prev)
+                    loss = F.mse_loss(latents_prev_rec, latent_prev)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    loss_item = loss.item()
-                    bar.update()
+                    # bar.update()
                    if loss_item < epsilon + i * 2e-5:
                        break
-                for j in range(j + 1, num_inner_steps):
-                    bar.update()
+                # for j in range(j + 1, num_inner_steps):
+                #     bar.update()
                uncond_embeddings_list.append(uncond_embeddings[:1].detach())
                with torch.no_grad():
                    context = torch.cat([uncond_embeddings, cond_embeddings])
                    latent_cur = self.get_noise_pred(latent_cur, t, False, context)
-            bar.close()
+            # bar.close()
            return uncond_embeddings_list
@@ -652,12 +688,13 @@ def main(
        inversion.append( Image.fromarray((sequence1[i] * 255).numpy().astype(np.uint8)) )
        videop2p.append( Image.fromarray((sequence2[i] * 255).numpy().astype(np.uint8)) )
 
-    inversion[0].save(save_name_1, save_all=True, append_images=inversion[1:], optimize=False, loop=0, duration=250)
+    # inversion[0].save(save_name_1, save_all=True, append_images=inversion[1:], optimize=False, loop=0, duration=250)
    videop2p[0].save(save_name_2, save_all=True, append_images=videop2p[1:], optimize=False, loop=0, duration=250)
 
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--config", type=str, default="./configs/videop2p.yaml")
+    parser.add_argument("--config", type=str, default="./configs/tuneavideo.yaml")
    parser.add_argument("--fast", action='store_true')
    args = parser.parse_args()
 
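Condensed, the editing stage that run_videop2p.py (and previously run.py) performs comes down to the call order below. This is only a sketch: it reuses names constructed inside main() (null_inversion, ldm_stable, make_controller, and the config-derived variables), so it is not a standalone program.

# Call-order sketch of the Video-P2P editing stage (not standalone).
(image_gt, image_rec), x_t, uncond_embeddings = null_inversion.invert_(
    image_path, prompt, offsets=(0, 0, 0, 0), verbose=True)   # fast path: DDIM inversion only

controller = make_controller(prompts, is_word_swap, cross_replace_steps,
                             self_replace_steps, blend_word, eq_params, mask_th=mask_th)
ptp_utils.register_attention_control(ldm_stable, controller)  # hook cross/self-attention

with torch.no_grad():
    sequence = ldm_stable(prompts, generator=torch.Generator(device=device),
                          latents=x_t, uncond_embeddings_pre=uncond_embeddings,
                          controller=controller, video_length=video_len, fast=fast).videos
# sequence[0] is the reconstruction, sequence[1] the edited clip saved as a GIF.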
trainer.py CHANGED
@@ -145,7 +145,9 @@ class Trainer:
        with open(config_path, 'w') as f:
            OmegaConf.save(config, f)
 
-        command = f'accelerate launch Video-P2P/run.py --config {config_path} --fast'
+        command = f'accelerate launch Video-P2P/run_tuning.py --config {config_path}'
+        subprocess.run(shlex.split(command))
+        command = f'python Video-P2P/run_tuning.py --config {config_path} --fast'
        subprocess.run(shlex.split(command))
        save_model_card(save_dir=output_dir,
                        base_model=base_model,
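For context, the Trainer now shells out twice in sequence: a tuning pass launched through accelerate, then a second pass with --fast, exactly as committed. A minimal reproduction of that sequence with an illustrative config path:

# Two-stage launch mirroring the Trainer change (config path is a placeholder).
import shlex
import subprocess

config_path = "configs/demo.yaml"

# Stage 1: fine-tuning, launched through accelerate.
subprocess.run(shlex.split(
    f"accelerate launch Video-P2P/run_tuning.py --config {config_path}"), check=True)

# Stage 2: the --fast pass (the committed command also targets run_tuning.py).
subprocess.run(shlex.split(
    f"python Video-P2P/run_tuning.py --config {config_path} --fast"), check=True)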