[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ChenyangQiQi/FateZero/blob/main/colab_fatezero.ipynb)

# FateZero: Fusing Attentions for Zero-shot Text-based Video Editing

[Chenyang Qi](https://chenyangqiqi.github.io/), [Xiaodong Cun](http://vinthony.github.io/), [Yong Zhang](https://yzhang2016.github.io), [Chenyang Lei](https://chenyanglei.github.io/), [Xintao Wang](https://xinntao.github.io/), [Ying Shan](https://scholar.google.com/citations?hl=zh-CN&user=4oXBp9UAAAAJ), and [Qifeng Chen](https://cqf.io)


[![Project Website](https://img.shields.io/badge/Project-Website-orange)](https://fate-zero-edit.github.io/)
[![arXiv](https://img.shields.io/badge/arXiv-2303.09535-b31b1b.svg)](https://arxiv.org/abs/2303.09535)
[![GitHub](https://img.shields.io/github/stars/ChenyangQiQi/FateZero?style=social)](https://github.com/ChenyangQiQi/FateZero)

In [2]:
#@markdown Check type of GPU and VRAM available.
# Query nvidia-smi for the GPU name, total memory and free memory,
# printed as CSV without a header row.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

Tesla T4, 15360 MiB, 15101 MiB


In [3]:
#@title Install requirements

# Clone the FateZero repository into /content/FateZero and make it the
# working directory; later cells use paths relative to it (data/, config/, result/).
!git clone https://github.com/ChenyangQiQi/FateZero /content/FateZero
%cd /content/FateZero
# %pip install -r requirements.txt
# Install (or upgrade to) a pre-release build of triton, quietly.
%pip install -q -U --pre triton
# diffusers, transformers and bitsandbytes are pinned to exact versions;
# the remaining packages are installed at whatever version pip resolves.
%pip install -q diffusers[torch]==0.11.1 transformers==4.26.0 bitsandbytes==0.35.4 \
decord accelerate omegaconf einops ftfy gradio imageio-ffmpeg xformers

Cloning into '/content/FateZero'...
remote: Enumerating objects: 332, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 332 (delta 50), reused 47 (delta 46), pack-reused 279[K
Receiving objects: 100% (332/332), 34.21 MiB | 14.26 MiB/s, done.
Resolving deltas: 100% (157/157), done.
/content/FateZero
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lit (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.9/524.9 KB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [4]:
#@title Download pretrained model

#@markdown Name/Path of the initial model.
MODEL_NAME = "CompVis/stable-diffusion-v1-4" #@param {type:"string"}

#@markdown If model should be download from a remote repo. Untick it if the model is loaded from a local path.
download_pretrained_model = True #@param {type:"boolean"}
if download_pretrained_model:
    # Clone https://huggingface.co/<MODEL_NAME> into ckpt/<MODEL_NAME>,
    # relative to the current working directory.
    !git lfs install
    !git clone https://huggingface.co/$MODEL_NAME ckpt/$MODEL_NAME
    # From here on MODEL_NAME holds the local checkout path instead of the
    # repo id; the clone above must therefore run before this reassignment.
    MODEL_NAME = f"./ckpt/{MODEL_NAME}"
# Report the value that later cells will read.
print(f"[*] MODEL_NAME={MODEL_NAME}")

Updated git hooks.
Git LFS initialized.
Cloning into 'ckpt/CompVis/stable-diffusion-v1-4'...
remote: Enumerating objects: 738, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 738 (delta 3), reused 1 (delta 0), pack-reused 726[K
Receiving objects: 100% (738/738), 682.52 KiB | 954.00 KiB/s, done.
Resolving deltas: 100% (123/123), done.
Filtering content: 100% (8/8), 10.20 GiB | 63.59 MiB/s, done.
[*] MODEL_NAME=./ckpt/CompVis/stable-diffusion-v1-4


# **Usage**


## FateZero Edit with low resource cost


In [16]:
#@markdown Edit config

#@markdown More details of the configuration will be given soon.

# Builds the editing configuration as a plain dict from the form values below
# and saves it as YAML to config/<VIDEO_ID>.yaml. MODEL_NAME must already be
# defined by an earlier cell.

from omegaconf import OmegaConf

VIDEO_FILE = 'data/car-turn' #@param {type:"string"}

# Last path component of VIDEO_FILE; trailing slashes are stripped first so
# that 'data/car-turn/' still yields 'car-turn' instead of an empty string.
VIDEO_ID = VIDEO_FILE.rstrip('/').split('/')[-1]

RESULT_DIR = 'result/'+VIDEO_ID

CONFIG_NAME = "config/"+VIDEO_ID+".yaml" 

source_prompt = "a silver jeep driving down a curvy road in the countryside" #@param {type:"string"}
edit_prompt = "watercolor painting of a silver jeep driving down a curvy road in the countryside"  #@param {type:"string"}
EMPHYSIS_WORD = "watercolor" #@param {type:"string"}
EMPHYSIS_VALUE = 10 #@param {type:"number"}
video_length = 8 #@param {type:"number"}
INVERSION_STEP = 8 #@param {type:"number"}
REPLACE_STRENGTH = 0.8 #@param {type:"slider", min:0, max:1, step:0.1}
STORE_ATTENTION_ON_disk = False #@param {type:"boolean"}
# NOTE(review): width and height are not referenced anywhere else in this cell.
width = 512 
height = 512 

config = {
  "pretrained_model_path": MODEL_NAME,
  "logdir": RESULT_DIR,
  "train_dataset": {
    "path": VIDEO_FILE,
    "prompt": source_prompt,
    "n_sample_frame": video_length,
    "sampling_rate": 1,
    "stride": 80,
    "offset": 
    {
        "left": 0,
        "right": 0,
        "top": 0,
        "bottom": 0,
    }
  },
  "validation_sample_logger_config":{
      "use_train_latents": True,
      "use_inversion_attention": True,
      "guidance_scale": 7.5,
      "prompts":[
          source_prompt,
          edit_prompt,
      ],
      # Two p2p_config entries, listed in the same order as `prompts`
      # (presumably one per prompt; confirm against test_fatezero.py).
      # The replace-step values come from the REPLACE_STRENGTH slider, which
      # was previously ignored in favour of a hard-coded 0.8 (its default).
      "p2p_config":[ 
          {
          "cross_replace_steps":{
              "default_":REPLACE_STRENGTH
              },
          "self_replace_steps": REPLACE_STRENGTH,
          "masked_self_attention": True,
           "bend_th": [2, 2],
          "is_replace_controller": False 
          },
          {
          "cross_replace_steps":{
              "default_":REPLACE_STRENGTH
              },
          "self_replace_steps": REPLACE_STRENGTH,
          "eq_params":{
              "words":[EMPHYSIS_WORD],
              "values": [EMPHYSIS_VALUE]
            },
          "use_inversion_attention": True,
          "is_replace_controller": False 
          }]
          ,
    # OmegaConf relative interpolation: resolves to train_dataset.n_sample_frame.
    "clip_length": "${..train_dataset.n_sample_frame}",
    "sample_seeds": [0],
    "num_inference_steps": INVERSION_STEP,
    "prompt2prompt_edit": True
     },
  "disk_store": STORE_ATTENTION_ON_disk,
  "model_config":{
      "lora": 160,
      "SparseCausalAttention_index": ['mid'],
      "least_sc_channel": 640
  },
  "test_pipeline_config":{
    "target": "video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline",
    # NOTE(review): this interpolation names `validation_sample_logger`, but the
    # key defined above is `validation_sample_logger_config`. Interpolations are
    # resolved lazily, so it only matters if this value is read; confirm.
    "num_inference_steps": "${..validation_sample_logger.num_inference_steps}"
  },
  "epsilon": 1e-5,
  "train_steps": 10,
  "seed": 0,
  "learning_rate": 1e-5,
  "train_temporal_conv": False,
  "guidance_scale": "${validation_sample_logger_config.guidance_scale}"
}

# Interpolation strings are written to the YAML unresolved (resolve defaults to False).
OmegaConf.save(config, CONFIG_NAME)
print('save new configue to ', CONFIG_NAME)

save new configue to  config/car-turn.yaml


In [17]:
# Run test_fatezero.py through `accelerate launch`, passing the path of the
# YAML file stored in CONFIG_NAME (set by the config cell) as --config.
!accelerate launch test_fatezero.py --config=$CONFIG_NAME

2023-03-22 09:04:20.819710: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 09:04:24.565385: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
2023-03-22 09:04:24.565750: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib64-nvidia
The following values were not passed to `accelerate launch` and had defaults used instead:
	`-

### Show the results

In [18]:
from IPython.display import HTML
from base64 import b64encode
import os, sys
import glob

# Pick the most recently written sample video among all result directories.
# Sorting the paths alphabetically is not enough: with several video names
# under ./result it would return the alphabetically-last one, not the newest.
sample_paths = glob.glob('./result/*/sample/step_0.mp4')
if not sample_paths:
    raise FileNotFoundError(
        "No './result/*/sample/step_0.mp4' found; run the editing cell first."
    )
# The path is used as a tie-breaker so the choice is deterministic when two
# files share the same modification time.
mp4_name = max(sample_paths, key=lambda path: (os.path.getmtime(path), path))

print(mp4_name)
# Read the video and embed it in the page as a base64 data URL.
with open(mp4_name, 'rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

print('Display animation: {}'.format(mp4_name), file=sys.stderr)
display(HTML("""
  <video width=512 controls>
        <source src="%s" type="video/mp4">
  </video>
  """ % data_url))

./result/car-turn_230322-090435/sample/step_0.mp4


Display animation: ./result/car-turn_230322-090435/sample/step_0.mp4


## Edit your video

In [None]:
#@markdown Upload your video(.mp4) by running this cell or skip this cell using the default data

import os
from google.colab import files
import shutil
from IPython.display import HTML
from base64 import b64encode

uploaded = files.upload()
for filename in uploaded.keys():
    dst_path = os.path.join("data", filename)
    shutil.move(filename, dst_path)
    
file_id = dst_path.replace('.mp4', '')

! mkdir -p $file_id
! ffmpeg -hide_banner -loglevel error -i $dst_path -vf scale="512:512" -vf fps=25 $file_id/%05d.png

mp4 = open('{}'.format(dst_path),'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

display(HTML("""
  <video width=512 controls>
        <source src="%s" type="video/mp4">
  </video>
  """ % data_url))
