chenyangqi committed
Commit 3060b7e
1 Parent(s): 8094e3b

add FateZero code

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. FateZero +0 -1
  2. FateZero/.gitignore +176 -0
  3. FateZero/LICENSE.md +21 -0
  4. FateZero/README.md +393 -0
  5. FateZero/colab_fatezero.ipynb +0 -0
  6. FateZero/config/.gitignore +1 -0
  7. FateZero/config/attribute/bear_tiger_lion_leopard.yaml +108 -0
  8. FateZero/config/attribute/bus_gpu.yaml +100 -0
  9. FateZero/config/attribute/cat_tiger_leopard_grass.yaml +112 -0
  10. FateZero/config/attribute/dog_robotic_corgi.yaml +103 -0
  11. FateZero/config/attribute/duck_rubber.yaml +99 -0
  12. FateZero/config/attribute/fox_wolf_snow.yaml +107 -0
  13. FateZero/config/attribute/rabbit_straberry_leaves_flowers.yaml +114 -0
  14. FateZero/config/attribute/squ_carrot_robot_eggplant.yaml +123 -0
  15. FateZero/config/attribute/swan_swa.yaml +102 -0
  16. FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml +83 -0
  17. FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps_disk_store.yaml +84 -0
  18. FateZero/config/style/jeep_watercolor.yaml +94 -0
  19. FateZero/config/style/lily_monet.yaml +93 -0
  20. FateZero/config/style/rabit_pokemon.yaml +92 -0
  21. FateZero/config/style/sun_flower_van_gogh.yaml +86 -0
  22. FateZero/config/style/surf_ukiyo.yaml +90 -0
  23. FateZero/config/style/swan_cartoon.yaml +101 -0
  24. FateZero/config/style/train_shinkai.yaml +97 -0
  25. FateZero/config/teaser/jeep_posche.yaml +93 -0
  26. FateZero/config/teaser/jeep_watercolor.yaml +94 -0
  27. FateZero/data/.gitignore +4 -0
  28. FateZero/data/teaser_car-turn/00000.png +0 -0
  29. FateZero/data/teaser_car-turn/00001.png +0 -0
  30. FateZero/data/teaser_car-turn/00002.png +0 -0
  31. FateZero/data/teaser_car-turn/00003.png +0 -0
  32. FateZero/data/teaser_car-turn/00004.png +0 -0
  33. FateZero/data/teaser_car-turn/00005.png +0 -0
  34. FateZero/data/teaser_car-turn/00006.png +0 -0
  35. FateZero/data/teaser_car-turn/00007.png +0 -0
  36. FateZero/docs/EditingGuidance.md +65 -0
  37. FateZero/docs/OpenSans-Regular.ttf +0 -0
  38. FateZero/requirements.txt +17 -0
  39. FateZero/test_fatezero.py +290 -0
  40. FateZero/test_fatezero_dataset.py +52 -0
  41. FateZero/test_install.py +23 -0
  42. FateZero/train_tune_a_video.py +426 -0
  43. FateZero/video_diffusion/common/image_util.py +203 -0
  44. FateZero/video_diffusion/common/instantiate_from_config.py +33 -0
  45. FateZero/video_diffusion/common/logger.py +17 -0
  46. FateZero/video_diffusion/common/set_seed.py +28 -0
  47. FateZero/video_diffusion/common/util.py +73 -0
  48. FateZero/video_diffusion/data/dataset.py +158 -0
  49. FateZero/video_diffusion/data/transform.py +48 -0
  50. FateZero/video_diffusion/models/attention.py +482 -0
FateZero DELETED
@@ -1 +0,0 @@
- Subproject commit 6992d238770f464c03a0a74cbcec4f99da4635ec
FateZero/.gitignore ADDED
@@ -0,0 +1,176 @@
+ start_hold
+ chenyangqi
+ trash/**
+ runs*/**
+ result/**
+ ckpt/**
+ ckpt
+ **.whl
+ stable-diffusion-v1-4
+ trash
+ # data/**
+
+ # Initially taken from Github's Python gitignore file
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # tests and logs
+ tests/fixtures/cached_*_text.txt
+ logs/
+ lightning_logs/
+ lang_code_data/
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # vscode
+ .vs
+ .vscode
+
+ # Pycharm
+ .idea
+
+ # TF code
+ tensorflow_code
+
+ # Models
+ proc_data
+
+ # examples
+ runs
+ /runs_old
+ /wandb
+ /examples/runs
+ /examples/**/*.args
+ /examples/rag/sweep
+
+ # emacs
+ *.*~
+ debug.env
+
+ # vim
+ .*.swp
+
+ #ctags
+ tags
+
+ # pre-commit
+ .pre-commit*
+
+ # .lock
+ *.lock
+
+ # DS_Store (MacOS)
+ .DS_Store
+ # RL pipelines may produce mp4 outputs
+ *.mp4
FateZero/LICENSE.md ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Chenyang QI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
FateZero/README.md ADDED
@@ -0,0 +1,393 @@
+ ## FateZero: Fusing Attentions for Zero-shot Text-based Video Editing
+
+ [Chenyang Qi](https://chenyangqiqi.github.io/), [Xiaodong Cun](http://vinthony.github.io/), [Yong Zhang](https://yzhang2016.github.io), [Chenyang Lei](https://chenyanglei.github.io/), [Xintao Wang](https://xinntao.github.io/), [Ying Shan](https://scholar.google.com/citations?hl=zh-CN&user=4oXBp9UAAAAJ), and [Qifeng Chen](https://cqf.io)
+
+ <a href='https://arxiv.org/abs/2303.09535'><img src='https://img.shields.io/badge/ArXiv-2303.09535-red'></a>
+ <a href='https://fate-zero-edit.github.io/'><img src='https://img.shields.io/badge/Project-Page-Green'></a> [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ChenyangQiQi/FateZero/blob/main/colab_fatezero.ipynb)
+ [![GitHub](https://img.shields.io/github/stars/ChenyangQiQi/FateZero?style=social)](https://github.com/ChenyangQiQi/FateZero)
+
+
+ <!-- ![fatezero_demo](./docs/teaser.png) -->
+
+ <table class="center">
+ <td><img src="docs/gif_results/17_car_posche_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/3_sunflower_vangogh_conat_result.gif"></td>
+ <tr>
+ <td width=25% style="text-align:center;">"silver jeep ➜ posche car"</td>
+ <td width=25% style="text-align:center;">"+ Van Gogh style"</td>
+ <!-- <td width=25% style="text-align:center;">"Wonder Woman, wearing a cowboy hat, is skiing"</td>
+ <td width=25% style="text-align:center;">"A man, wearing pink clothes, is skiing at sunset"</td> -->
+ </tr>
+ </table>
+
+ ## Abstract
+ <b>TL;DR: FateZero edits your video with pretrained diffusion models, without any training.</b>
+
+ <details><summary>CLICK for full abstract</summary>
+
+ > Diffusion-based generative models have achieved remarkable success in text-based image generation. However, because the generation process contains enormous randomness, it is still challenging to apply such models to real-world visual content editing, especially in videos. In this paper, we propose FateZero, a zero-shot text-based editing method for real-world videos that requires no per-prompt training or user-specific mask. To edit videos consistently, we propose several techniques based on pre-trained models. First, in contrast to the straightforward DDIM inversion technique, our approach captures intermediate attention maps during inversion, which effectively retain both structural and motion information. These maps are directly fused in the editing process rather than generated during denoising. To further minimize semantic leakage from the source video, we then fuse self-attentions with a blending mask obtained from the cross-attention features of the source prompt. Furthermore, we reform the self-attention mechanism in the denoising UNet by introducing spatial-temporal attention to ensure frame consistency. Though succinct, our method is the first to show the ability of zero-shot text-driven video style and local attribute editing from a trained text-to-image model. We also achieve better zero-shot shape-aware editing based on a text-to-video model. Extensive experiments demonstrate superior temporal consistency and editing capability compared with previous works.
+ </details>
+
+ ## Changelog
+ - 2023.03.27 Release the [`attribute editing config`](config/attribute) and
+ <!-- [`data`](https://hkustconnect-my.sharepoint.com/:u:/g/personal/cqiaa_connect_ust_hk/Ee7J2IzZuaVGkefh-ZRp1GwB7RCUYU7MVJCKqeNWmOIpfg?e=dcOwb7) -->
+ the [`data`](https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/attribute.zip) used in the paper.
+ - 2023.03.22 Upload a `colab notebook` [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ChenyangQiQi/FateZero/blob/main/colab_fatezero.ipynb). Enjoy zero-shot video editing freely!
+ - 2023.03.22 Release the [`style editing config`](config/style) and
+ <!--[`data`](https://hkustconnect-my.sharepoint.com/:u:/g/personal/cqiaa_connect_ust_hk/EaTqRAuW0eJLj0z_JJrURkcBZCC3Zvgsdo6zsXHhpyHhHQ?e=FzuiNG) -->
+ the [`data`](https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/style.zip) used in the paper.
+ - 2023.03.21 [Editing guidance](docs/EditingGuidance.md) is provided to help users edit in-the-wild videos. Welcome to play with it and give feedback!
+ - 2023.03.21 Update the `codebase and configuration`. It can now run with lower resources (a 16 GB GPU and less than 16 GB of CPU RAM) using the [new configuration](config/low_resource_teaser) in `config/low_resource_teaser`.
+ <!-- A new option stores all the attentions on the hard disk, which requires less RAM. -->
+ - 2023.03.17 Release code and paper!
+
+ ## Todo
+
+ - [x] Release the edit config for the teaser
+ - [x] Memory and runtime profiling
+ - [x] Hands-on guidance for hyperparameter tuning
+ - [x] Colab
+ - [x] Release configs for other results and the in-the-wild dataset
+ <!-- - [x] Style editing: done
+ - [-] Attribute editing: in progress -->
+ - [-] Hugging Face demo: in progress
+ - [ ] Tune-A-Video optimization and shape-editing configs
+ - [ ] Release more applications
+
+ ## Setup Environment
+ Our method is tested with CUDA 11, fp16 (via accelerate), and xformers on a single A100 or 3090 GPU.
+
+ ```bash
+ conda create -n fatezero38 python=3.8
+ conda activate fatezero38
+
+ pip install -r requirements.txt
+ ```
+
+ `xformers` is recommended on A100 GPUs to save memory and running time.
+
+ <details><summary>Click for xformers installation </summary>
+
+ We find its installation is not always stable. You may try the following wheel:
+ ```bash
+ wget https://github.com/ShivamShrirao/xformers-wheels/releases/download/4c06c79/xformers-0.0.15.dev0+4c06c79.d20221201-cp38-cp38-linux_x86_64.whl
+ pip install xformers-0.0.15.dev0+4c06c79.d20221201-cp38-cp38-linux_x86_64.whl
+ ```
+
+ </details>
+
+ Validate the installation by running
+ ```
+ python test_install.py
+ ```
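For reference, the kind of thing such a check needs to cover is sketched below. This is an illustrative script, not necessarily what the repository's `test_install.py` does; it only verifies that PyTorch sees a GPU and that the optional `xformers` package imports.

```python
# Illustrative environment check (not the repository's test_install.py).
import torch

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

try:
    import xformers  # optional, recommended on A100 GPUs
    print("xformers:", xformers.__version__)
except ImportError:
    print("xformers not installed (optional)")
```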
+
+ Our environment is similar to Tune-A-Video ([official](https://github.com/showlab/Tune-A-Video), [unofficial](https://github.com/bryandlee/Tune-A-Video)) and [prompt-to-prompt](https://github.com/google/prompt-to-prompt/). You may check them for more details.
+
+
+ ## FateZero Editing
+
+ #### Style and Attribute Editing in Teaser
+
+ Download [Stable Diffusion v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) (or another image diffusion model of interest) and put it at `./ckpt/stable-diffusion-v1-4`.
+
+ <details><summary>Click for bash command: </summary>
+
+ ```
+ mkdir ./ckpt
+ # download from Hugging Face; takes about 20 GB of space
+ git lfs install
+ git clone https://huggingface.co/CompVis/stable-diffusion-v1-4
+ cd ./ckpt
+ ln -s ../stable-diffusion-v1-4 .
+ ```
+ </details>
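Before launching an edit, it can save time to confirm that the checkpoint directory actually loads. The snippet below is a hypothetical sanity check using the `diffusers` `StableDiffusionPipeline`; FateZero's own pipeline (see `test_pipeline_config` in the configs) reads the same `./ckpt/stable-diffusion-v1-4` path.

```python
# Hypothetical check that the Stable Diffusion v1-4 weights are in place.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "./ckpt/stable-diffusion-v1-4", torch_dtype=torch.float16
)
n_params = sum(p.numel() for p in pipe.unet.parameters()) / 1e6
print(f"UNet loaded with {n_params:.0f}M parameters")
```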
+
+ Then, you can reproduce the style and shape editing results in our teaser by running:
+
+ ```bash
+ accelerate launch test_fatezero.py --config config/teaser/jeep_watercolor.yaml
+ # or CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/teaser/jeep_watercolor.yaml
+ ```
+
+ <details><summary>The result is saved at `./result`. (Click for directory structure) </summary>
+
+ ```
+ result
+ ├── teaser
+ │   ├── jeep_posche
+ │   ├── jeep_watercolor
+ │   │   ├── cross-attention  # visualization of cross-attention during inversion
+ │   │   ├── sample           # result
+ │   │   ├── train_samples    # the input video
+ ```
+
+ </details>
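If you want to flip through a folder of saved frames (for example the input frames under `train_samples`) as a single GIF, a small helper like the one below works, assuming `imageio` is available. The exact folder names inside `result` can vary by run, so treat the path as a placeholder.

```python
# Assemble a folder of PNG frames into a GIF for quick inspection.
# The path is a placeholder; point it at any frame folder produced by a run.
from pathlib import Path

import imageio

frames_dir = Path("result/teaser/jeep_watercolor/train_samples")
frames = [imageio.imread(p) for p in sorted(frames_dir.glob("*.png"))]
imageio.mimsave("preview.gif", frames, duration=0.125)  # ~8 fps
```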
+
+ Editing 8 frames on an NVIDIA 3090 uses about `100 GB of CPU memory and 12 GB of GPU memory`. We also provide some [`low-cost settings`](config/low_resource_teaser) for style editing with different hyperparameters on a 16 GB GPU.
+ You may try these low-cost settings on Colab.
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ChenyangQiQi/FateZero/blob/main/colab_fatezero.ipynb)
+
+ More speed and hardware benchmarks are listed [here](docs/EditingGuidance.md#ddim-hyperparameters).
+
+ #### Shape and large motion editing with Tune-A-Video
+
+ Besides the style and attribute editing above, we also provide a `Tune-A-Video` [checkpoint](https://hkustconnect-my.sharepoint.com/:f:/g/personal/cqiaa_connect_ust_hk/EviSTWoAOs1EmHtqZruq50kBZu1E8gxDknCPigSvsS96uQ?e=492khj). You may download it and move it to `./ckpt/jeep_tuned_200/`.
+ <!-- We provide the [Tune-a-Video](https://drive.google.com/file/d/166eNbabM6TeJVy7hxol2gL1kUGKHi3Do/view?usp=share_link), you could download the data, unzip and put it to `data`. : -->
+
+ <details><summary>The directory structure should look like this: (Click for directory structure) </summary>
+
+ ```
+ ckpt
+ ├── stable-diffusion-v1-4
+ ├── jeep_tuned_200
+ ...
+ data
+ ├── car-turn
+ │   ├── 00000000.png
+ │   ├── 00000001.png
+ │   ├── ...
+ video_diffusion
+ ```
+ </details>
+
+ You can reproduce the shape editing result in our teaser by running:
+
+ ```bash
+ accelerate launch test_fatezero.py --config config/teaser/jeep_posche.yaml
+ ```
+
+
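If you are unsure whether everything landed in the right place, a small optional pre-flight check like the sketch below can catch missing folders before a long run. The paths are taken from the directory sketch above; adjust them if your layout differs.

```python
# Optional pre-flight check that the folders described above are in place.
from pathlib import Path

expected = [
    "ckpt/stable-diffusion-v1-4",
    "ckpt/jeep_tuned_200",
    "data/car-turn",
]
for p in expected:
    status = "ok" if Path(p).exists() else "MISSING"
    print(f"{p}: {status}")
```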
+ ### Reproduce other results in the paper (in progress)
+ <!-- Download the data of [style editing](https://hkustconnect-my.sharepoint.com/:u:/g/personal/cqiaa_connect_ust_hk/EaTqRAuW0eJLj0z_JJrURkcBZCC3Zvgsdo6zsXHhpyHhHQ?e=FzuiNG) and [attribute editing](https://hkustconnect-my.sharepoint.com/:u:/g/personal/cqiaa_connect_ust_hk/Ee7J2IzZuaVGkefh-ZRp1GwB7RCUYU7MVJCKqeNWmOIpfg?e=dcOwb7)
+ -->
+ Download the data for style editing and attribute editing
+ from [OneDrive](https://hkustconnect-my.sharepoint.com/:f:/g/personal/cqiaa_connect_ust_hk/EkIeHj3CQiBNhm6iEEhJQZwBEBJNCGt3FsANmyqeAYbuXQ?e=FxYtJk) or from the GitHub [Release](https://github.com/ChenyangQiQi/FateZero/releases/tag/v0.0.1).
+ <details><summary>Click for wget bash command: </summary>
+
+ ```
+ wget https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/attribute.zip
+ wget https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/style.zip
+ ```
+ </details>
+
+ Unzip and place the data in ['./data'](data). Then use the commands in ['config/style'](config/style) and ['config/attribute'](config/attribute) to get the results, as in the sketch below.
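If you prefer doing this step from Python (for example inside the Colab notebook), an equivalent of the `wget`/`unzip` commands is sketched below. The archive names match the release assets above; the script assumes each archive contains a top-level `attribute/` or `style/` folder, as the configs expect paths like `data/attribute/...`.

```python
# Download and unpack the released editing data into ./data
# (equivalent to the wget + unzip steps above).
import urllib.request
import zipfile
from pathlib import Path

base = "https://github.com/ChenyangQiQi/FateZero/releases/download/v0.0.1/"
Path("data").mkdir(exist_ok=True)
for name in ["attribute.zip", "style.zip"]:
    urllib.request.urlretrieve(base + name, name)
    with zipfile.ZipFile(name) as zf:
        zf.extractall("data")  # assumes a top-level attribute/ or style/ folder inside
```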
+
+ The configs for our Tune-A-Video checkpoints will be updated later.
+
+ ## Tuning guidance to edit YOUR video
+ We provide tuning guidance for editing in-the-wild videos [here](./docs/EditingGuidance.md). The work is still in progress; feedback via issues is welcome.
+
+ ## Style Editing Results with Stable Diffusion
+ We show the difference between the source prompt and the target prompt in the box below each video.
+
+ Note that the mp4 and GIF files on this GitHub page are compressed.
+ Please check our [Project Page](https://fate-zero-edit.github.io/) for the mp4 files of the original video editing results.
+ <table class="center">
+
+ <tr>
+ <td><img src="docs/gif_results/style/1_surf_ukiyo_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/style/2_car_watercolor_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/style/6_lily_monet_01_concat_result.gif"></td>
+ <!-- <td><img src="https://tuneavideo.github.io/assets/results/tuneavideo/man-skiing/wonder-woman.gif"></td>
+ <td><img src="https://tuneavideo.github.io/assets/results/tuneavideo/man-skiing/pink-sunset.gif"></td> -->
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"+ Ukiyo-e style"</td>
+ <td width=25% style="text-align:center;">"+ watercolor painting"</td>
+ <td width=25% style="text-align:center;">"+ Monet style"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/style/4_rabit_pokemon_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/style/5_train_shikai_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/style/7_swan_carton_01_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"+ Pokémon cartoon style"</td>
+ <td width=25% style="text-align:center;">"+ Makoto Shinkai style"</td>
+ <td width=25% style="text-align:center;">"+ cartoon style"</td>
+ </tr>
+ </table>
+
+ ## Attribute Editing Results with Stable Diffusion
+ <table class="center">
+
+ <tr>
+ <td><img src="docs/gif_results/attri/15_rabbit_eat_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/15_rabbit_eat_02_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/15_rabbit_eat_04_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"rabbit, strawberry ➜ white rabbit, flower"</td>
+ <td width=25% style="text-align:center;">"rabbit, strawberry ➜ squirrel, carrot"</td>
+ <td width=25% style="text-align:center;">"rabbit, strawberry ➜ white rabbit, leaves"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/attri/16_sq_eat_04_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/16_sq_eat_02_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/16_sq_eat_03_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"squirrel ➜ robot squirrel"</td>
+ <td width=25% style="text-align:center;">"squirrel, Carrot ➜ rabbit, eggplant"</td>
+ <td width=25% style="text-align:center;">"squirrel, Carrot ➜ robot mouse, screwdriver"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/attri/13_bear_tiger_leopard_lion_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/13_bear_tiger_leopard_lion_02_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/13_bear_tiger_leopard_lion_03_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"bear ➜ a red tiger"</td>
+ <td width=25% style="text-align:center;">"bear ➜ a yellow leopard"</td>
+ <td width=25% style="text-align:center;">"bear ➜ a brown lion"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/attri/14_cat_grass_tiger_corgin_02_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/14_cat_grass_tiger_corgin_03_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/14_cat_grass_tiger_corgin_04_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"cat ➜ black cat, grass..."</td>
+ <td width=25% style="text-align:center;">"cat ➜ red tiger"</td>
+ <td width=25% style="text-align:center;">"cat ➜ Shiba-Inu"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/attri/10_bus_gpu_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/11_dog_robotic_corgin_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/11_dog_robotic_corgin_02_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"bus ➜ GPU"</td>
+ <td width=25% style="text-align:center;">"gray dog ➜ yellow corgi"</td>
+ <td width=25% style="text-align:center;">"gray dog ➜ robotic dog"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/attri/9_duck_rubber_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/12_fox_snow_wolf_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/attri/12_fox_snow_wolf_02_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"white duck ➜ yellow rubber duck"</td>
+ <td width=25% style="text-align:center;">"grass ➜ snow"</td>
+ <td width=25% style="text-align:center;">"white fox ➜ grey wolf"</td>
+ </tr>
+
+ </table>
+
+ ## Shape and large motion editing with Tune-A-Video
+ <table class="center">
+
+ <tr>
+ <td><img src="docs/gif_results/shape/17_car_posche_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/shape/18_swan_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/shape/18_swan_02_concat_result.gif"></td>
+ <!-- <td><img src="https://tuneavideo.github.io/assets/results/tuneavideo/man-skiing/wonder-woman.gif"></td>
+ <td><img src="https://tuneavideo.github.io/assets/results/tuneavideo/man-skiing/pink-sunset.gif"></td> -->
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"silver jeep ➜ posche car"</td>
+ <td width=25% style="text-align:center;">"Swan ➜ White Duck"</td>
+ <td width=25% style="text-align:center;">"Swan ➜ Pink flamingo"</td>
+ </tr>
+
+ <tr>
+ <td><img src="docs/gif_results/shape/19_man_wonder_01_concat_result.gif"></td>
+ <td><img src="docs/gif_results/shape/19_man_wonder_02_concat_result.gif"></td>
+ <td><img src="docs/gif_results/shape/19_man_wonder_03_concat_result.gif"></td>
+ </tr>
+ <tr>
+ <td width=25% style="text-align:center;">"A man ➜ A Batman"</td>
+ <td width=25% style="text-align:center;">"A man ➜ A Wonder Woman, With cowboy hat"</td>
+ <td width=25% style="text-align:center;">"A man ➜ A Spider-Man"</td>
+ </tr>
+ </table>
+
+
+ ## Demo Video
+
+ https://user-images.githubusercontent.com/45789244/225698509-79c14793-3153-4bba-9d6e-ede7d811d7f8.mp4
+
+ The video here is compressed due to GitHub's file size limit.
+ The original full-resolution video is [here](https://hkustconnect-my.sharepoint.com/:v:/g/personal/cqiaa_connect_ust_hk/EXKDI_nahEhKtiYPvvyU9SkBDTG2W4G1AZ_vkC7ekh3ENw?e=Xhgtmk).
+
+
+ ## Citation
+
+ ```
+ @misc{qi2023fatezero,
+       title={FateZero: Fusing Attentions for Zero-shot Text-based Video Editing},
+       author={Chenyang Qi and Xiaodong Cun and Yong Zhang and Chenyang Lei and Xintao Wang and Ying Shan and Qifeng Chen},
+       year={2023},
+       eprint={2303.09535},
+       archivePrefix={arXiv},
+       primaryClass={cs.CV}
+ }
+ ```
+
+
+ ## Acknowledgements
+
+ This repository borrows heavily from [Tune-A-Video](https://github.com/showlab/Tune-A-Video) and [prompt-to-prompt](https://github.com/google/prompt-to-prompt/). Thanks to the authors for sharing their code and models.
+
+ ## Maintenance
+
+ This is the codebase for our research work. We are still working hard to update this repo, and more details will be added in the coming days. If you have any questions or ideas to discuss, feel free to contact [Chenyang Qi](cqiaa@connect.ust.hk) or [Xiaodong Cun](vinthony@gmail.com).
+
FateZero/colab_fatezero.ipynb ADDED
The diff for this file is too large to render.
FateZero/config/.gitignore ADDED
@@ -0,0 +1 @@
+ # debug/**
FateZero/config/attribute/bear_tiger_lion_leopard.yaml ADDED
@@ -0,0 +1,108 @@
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/bear_tiger_lion_leopard.yaml
+
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
+
+
+ train_dataset:
+   path: "data/attribute/bear_tiger_lion_leopard"
+   prompt: "a brown bear walking on the rock against a wall"
+   n_sample_frame: 8
+   # n_sample_frame: 22
+   sampling_rate: 1
+   stride: 80
+   offset:
+     left: 0
+     right: 0
+     top: 0
+     bottom: 0
+
+ validation_sample_logger_config:
+   use_train_latents: True
+   use_inversion_attention: True
+   guidance_scale: 7.5
+   prompts: [
+     # source prompt
+     a brown bear walking on the rock against a wall,
+
+     # foreground texture style
+     a red tiger walking on the rock against a wall,
+     a yellow leopard walking on the rock against a wall,
+     a brown lion walking on the rock against a wall,
+   ]
+   p2p_config:
+     0:
+       # Whether to directly copy the cross-attention from the source.
+       # True: directly copy, better for object replacement
+       # False: keep source attention, better for style
+       is_replace_controller: False
+
+       # Semantic preserving and replacement
+       cross_replace_steps:
+         default_: 0.8
+
+       # Source background structure preserving, in [0, 1].
+       # e.g., 0.6 replaces the self-attention in the first 60% of steps
+       self_replace_steps: 0.6
+
+       # Amplify the cross-attention of the target words; a larger value moves the result closer to the target
+       eq_params:
+         words: ["silver", "sculpture"]
+         values: [2,2]
+
+       # Target structure-divergence hyperparameters.
+       # If you change the shape of the object, it is better to use all three lines below; otherwise they are not needed.
+       # Without the following three lines, all self-attention will be replaced.
+       blend_words: [['cat',], ["cat",]]
+       masked_self_attention: True
+       # masked_latents: False # performance not so good in our case, needs debugging
+       bend_th: [2, 2]
+       # Preserve the source structure of blend_words, in [0, 1].
+       # The default bend_th: [2, 2] preserves all source self-attention.
+       # bend_th: [0.0, 0.0] sets the mask to 1, i.e. more generated attention and less source attention.
+
+     1:
+       is_replace_controller: true
+       cross_replace_steps:
+         default_: 0.7
+       self_replace_steps: 0.7
+     2:
+       is_replace_controller: true
+       cross_replace_steps:
+         default_: 0.7
+       self_replace_steps: 0.7
+     3:
+       is_replace_controller: true
+       cross_replace_steps:
+         default_: 0.7
+       self_replace_steps: 0.7
+
+   clip_length: "${..train_dataset.n_sample_frame}"
+   sample_seeds: [0]
+   val_all_frames: False
+
+   num_inference_steps: 50
+   prompt2prompt_edit: True
+
+
+ model_config:
+   lora: 160
+   # temporal_downsample_time: 4
+   SparseCausalAttention_index: ['mid']
+   least_sc_channel: 640
+   # least_sc_channel: 100000
+
+ test_pipeline_config:
+   target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
+   num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
+
+ epsilon: 1e-5
+ train_steps: 10
+ seed: 0
+ learning_rate: 1e-5
+ train_temporal_conv: False
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
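The comments in this config describe the main prompt-to-prompt knobs (`cross_replace_steps`, `self_replace_steps`, `eq_params`, `blend_words`/`bend_th`). As a rough illustration of how such a config can be inspected, the sketch below loads it with OmegaConf, whose `${..}` interpolation syntax the config appears to use; the repository's own loading code in `test_fatezero.py` may differ.

```python
# Illustrative only: inspect a FateZero-style config with OmegaConf.
# The actual loading code in test_fatezero.py may differ.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/attribute/bear_tiger_lion_leopard.yaml")

print("source prompt:", cfg.train_dataset.prompt)
print("target prompts:", list(cfg.validation_sample_logger_config.prompts))

# p2p_config appears to hold one block per entry in `prompts`
# (index 0 pairs with the source prompt, 1..N with the edited prompts).
for idx, p2p in cfg.validation_sample_logger_config.p2p_config.items():
    print(idx, "cross:", p2p.cross_replace_steps.default_, "self:", p2p.self_replace_steps)
```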
FateZero/config/attribute/bus_gpu.yaml ADDED
@@ -0,0 +1,100 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/bus_gpu.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+
6
+ train_dataset:
7
+ path: "data/attribute/bus_gpu"
8
+ prompt: "a white and blue bus on the road"
9
+ n_sample_frame: 8
10
+ # n_sample_frame: 22
11
+ sampling_rate: 1
12
+ stride: 80
13
+ offset:
14
+ left: 0
15
+ right: 0
16
+ top: 0
17
+ bottom: 0
18
+
19
+ validation_sample_logger_config:
20
+ use_train_latents: True
21
+ use_inversion_attention: True
22
+ guidance_scale: 7.5
23
+ prompts: [
24
+ # source prompt
25
+ a white and blue bus on the road,
26
+
27
+ # foreground texture style
28
+ a black and green GPU on the road
29
+ ]
30
+ p2p_config:
31
+ 0:
32
+ # Whether to directly copy the cross attention from source
33
+ # True: directly copy, better for object replacement
34
+ # False: keep source attention, better for style
35
+ is_replace_controller: False
36
+
37
+ # Semantic preserving and replacement Debug me
38
+ cross_replace_steps:
39
+ default_: 0.8
40
+
41
+ # Source background structure preserving, in [0, 1].
42
+ # e.g., =0.6 Replace the first 60% steps self-attention
43
+ self_replace_steps: 0.6
44
+
45
+
46
+ # Amplify the target-words cross attention, larger value, more close to target
47
+ eq_params:
48
+ words: ["silver", "sculpture"]
49
+ values: [2,2]
50
+
51
+ # Target structure-divergence hyperparames
52
+ # If you change the shape of object better to use all three line, otherwise, no need.
53
+ # Without following three lines, all self-attention will be replaced
54
+ blend_words: [['cat',], ["cat",]]
55
+ masked_self_attention: True
56
+ # masked_latents: False # performance not so good in our case, need debug
57
+ bend_th: [2, 2]
58
+ # preserve source structure of blend_words , [0, 1]
59
+ # default is bend_th: [2, 2] # preserve all source self-attention
60
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
61
+
62
+
63
+ 1:
64
+ is_replace_controller: true
65
+ cross_replace_steps:
66
+ default_: 0.1
67
+ self_replace_steps: 0.1
68
+
69
+ eq_params:
70
+ words: ["Nvidia", "GPU"]
71
+ values: [10, 10] # amplify attention to the word "tiger" by *2
72
+
73
+
74
+
75
+
76
+ clip_length: "${..train_dataset.n_sample_frame}"
77
+ sample_seeds: [0]
78
+ val_all_frames: False
79
+
80
+ num_inference_steps: 50
81
+ prompt2prompt_edit: True
82
+
83
+
84
+ model_config:
85
+ lora: 160
86
+ # temporal_downsample_time: 4
87
+ SparseCausalAttention_index: ['mid']
88
+ least_sc_channel: 640
89
+ # least_sc_channel: 100000
90
+
91
+ test_pipeline_config:
92
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
93
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
94
+
95
+ epsilon: 1e-5
96
+ train_steps: 10
97
+ seed: 0
98
+ learning_rate: 1e-5
99
+ train_temporal_conv: False
100
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/cat_tiger_leopard_grass.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/cat_tiger_leopard_grass.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+
6
+ train_dataset:
7
+ path: "data/attribute/cat_tiger_leopard_grass"
8
+ prompt: "A black cat walking on the floor next to a wall"
9
+ n_sample_frame: 8
10
+ # n_sample_frame: 22
11
+ sampling_rate: 1
12
+ stride: 80
13
+ offset:
14
+ left: 0
15
+ right: 0
16
+ top: 0
17
+ bottom: 0
18
+
19
+ validation_sample_logger_config:
20
+ use_train_latents: True
21
+ use_inversion_attention: True
22
+ guidance_scale: 7.5
23
+ prompts: [
24
+ # source prompt
25
+ A black cat walking on the floor next to a wall,
26
+ A black cat walking on the grass next to a wall,
27
+ A red tiger walking on the floor next to a wall,
28
+ a yellow cute Shiba-Inu walking on the floor next to a wall,
29
+ a yellow cute leopard walking on the floor next to a wall,
30
+ ]
31
+ p2p_config:
32
+ 0:
33
+ # Whether to directly copy the cross attention from source
34
+ # True: directly copy, better for object replacement
35
+ # False: keep source attention, better for style
36
+ is_replace_controller: False
37
+
38
+ # Semantic preserving and replacement Debug me
39
+ cross_replace_steps:
40
+ default_: 0.8
41
+
42
+ # Source background structure preserving, in [0, 1].
43
+ # e.g., =0.6 Replace the first 60% steps self-attention
44
+ self_replace_steps: 0.6
45
+
46
+
47
+ # Amplify the target-words cross attention, larger value, more close to target
48
+ eq_params:
49
+ words: ["silver", "sculpture"]
50
+ values: [2,2]
51
+
52
+ # Target structure-divergence hyperparames
53
+ # If you change the shape of object better to use all three line, otherwise, no need.
54
+ # Without following three lines, all self-attention will be replaced
55
+ blend_words: [['cat',], ["cat",]]
56
+ masked_self_attention: True
57
+ # masked_latents: False # performance not so good in our case, need debug
58
+ bend_th: [2, 2]
59
+ # preserve source structure of blend_words , [0, 1]
60
+ # default is bend_th: [2, 2] # preserve all source self-attention
61
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
62
+
63
+
64
+ 1:
65
+ is_replace_controller: false
66
+ cross_replace_steps:
67
+ default_: 0.5
68
+ self_replace_steps: 0.5
69
+ 2:
70
+ is_replace_controller: false
71
+ cross_replace_steps:
72
+ default_: 0.5
73
+ self_replace_steps: 0.5
74
+ 3:
75
+ is_replace_controller: false
76
+ cross_replace_steps:
77
+ default_: 0.5
78
+ self_replace_steps: 0.5
79
+ 4:
80
+ is_replace_controller: false
81
+ cross_replace_steps:
82
+ default_: 0.7
83
+ self_replace_steps: 0.7
84
+
85
+
86
+
87
+
88
+ clip_length: "${..train_dataset.n_sample_frame}"
89
+ sample_seeds: [0]
90
+ val_all_frames: False
91
+
92
+ num_inference_steps: 50
93
+ prompt2prompt_edit: True
94
+
95
+
96
+ model_config:
97
+ lora: 160
98
+ # temporal_downsample_time: 4
99
+ SparseCausalAttention_index: ['mid']
100
+ least_sc_channel: 640
101
+ # least_sc_channel: 100000
102
+
103
+ test_pipeline_config:
104
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
105
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
106
+
107
+ epsilon: 1e-5
108
+ train_steps: 10
109
+ seed: 0
110
+ learning_rate: 1e-5
111
+ train_temporal_conv: False
112
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/dog_robotic_corgi.yaml ADDED
@@ -0,0 +1,103 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/dog_robotic_corgi.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "data/attribute/gray_dog"
7
+ prompt: "A gray dog sitting on the mat"
8
+ n_sample_frame: 8
9
+ # n_sample_frame: 22
10
+ sampling_rate: 1
11
+ stride: 80
12
+ offset:
13
+ left: 0
14
+ right: 0
15
+ top: 0
16
+ bottom: 0
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: True
20
+ use_inversion_attention: True
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ # source prompt
24
+ A gray dog sitting on the mat,
25
+
26
+ # foreground texture style
27
+ A robotic dog sitting on the mat,
28
+ A yellow corgi sitting on the mat
29
+ ]
30
+ p2p_config:
31
+ 0:
32
+ # Whether to directly copy the cross attention from source
33
+ # True: directly copy, better for object replacement
34
+ # False: keep source attention, better for style
35
+ is_replace_controller: False
36
+
37
+ # Semantic preserving and replacement Debug me
38
+ cross_replace_steps:
39
+ default_: 0.8
40
+
41
+ # Source background structure preserving, in [0, 1].
42
+ # e.g., =0.6 Replace the first 60% steps self-attention
43
+ self_replace_steps: 0.6
44
+
45
+
46
+ # Amplify the target-words cross attention, larger value, more close to target
47
+ eq_params:
48
+ words: ["silver", "sculpture"]
49
+ values: [2,2]
50
+
51
+ # Target structure-divergence hyperparames
52
+ # If you change the shape of object better to use all three line, otherwise, no need.
53
+ # Without following three lines, all self-attention will be replaced
54
+ blend_words: [['cat',], ["cat",]]
55
+ masked_self_attention: True
56
+ # masked_latents: False # performance not so good in our case, need debug
57
+ bend_th: [2, 2]
58
+ # preserve source structure of blend_words , [0, 1]
59
+ # default is bend_th: [2, 2] # preserve all source self-attention
60
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
61
+
62
+
63
+ 1:
64
+ is_replace_controller: false
65
+ cross_replace_steps:
66
+ default_: 0.5
67
+ self_replace_steps: 0.5
68
+
69
+ eq_params:
70
+ words: ["robotic"]
71
+ values: [10] # amplify attention to the word "tiger" by *2
72
+
73
+ 2:
74
+ is_replace_controller: false
75
+ cross_replace_steps:
76
+ default_: 0.5
77
+ self_replace_steps: 0.5
78
+
79
+ clip_length: "${..train_dataset.n_sample_frame}"
80
+ sample_seeds: [0]
81
+ val_all_frames: False
82
+
83
+ num_inference_steps: 50
84
+ prompt2prompt_edit: True
85
+
86
+
87
+ model_config:
88
+ lora: 160
89
+ # temporal_downsample_time: 4
90
+ SparseCausalAttention_index: ['mid']
91
+ least_sc_channel: 640
92
+ # least_sc_channel: 100000
93
+
94
+ test_pipeline_config:
95
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
96
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
97
+
98
+ epsilon: 1e-5
99
+ train_steps: 10
100
+ seed: 0
101
+ learning_rate: 1e-5
102
+ train_temporal_conv: False
103
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/duck_rubber.yaml ADDED
@@ -0,0 +1,99 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/duck_rubber.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "data/attribute/duck_rubber"
7
+ prompt: "a sleepy white duck"
8
+ n_sample_frame: 8
9
+ # n_sample_frame: 22
10
+ sampling_rate: 1
11
+ stride: 80
12
+ offset:
13
+ left: 0
14
+ right: 0
15
+ top: 0
16
+ bottom: 0
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: True
20
+ use_inversion_attention: True
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ # source prompt
24
+ a sleepy white duck,
25
+
26
+ # foreground texture style
27
+ a sleepy yellow rubber duck
28
+ ]
29
+ p2p_config:
30
+ 0:
31
+ # Whether to directly copy the cross attention from source
32
+ # True: directly copy, better for object replacement
33
+ # False: keep source attention, better for style
34
+ is_replace_controller: False
35
+
36
+ # Semantic preserving and replacement Debug me
37
+ cross_replace_steps:
38
+ default_: 0.8
39
+
40
+ # Source background structure preserving, in [0, 1].
41
+ # e.g., =0.6 Replace the first 60% steps self-attention
42
+ self_replace_steps: 0.6
43
+
44
+
45
+ # Amplify the target-words cross attention, larger value, more close to target
46
+ eq_params:
47
+ words: ["silver", "sculpture"]
48
+ values: [2,2]
49
+
50
+ # Target structure-divergence hyperparames
51
+ # If you change the shape of object better to use all three line, otherwise, no need.
52
+ # Without following three lines, all self-attention will be replaced
53
+ blend_words: [['cat',], ["cat",]]
54
+ masked_self_attention: True
55
+ # masked_latents: False # performance not so good in our case, need debug
56
+ bend_th: [2, 2]
57
+ # preserve source structure of blend_words , [0, 1]
58
+ # default is bend_th: [2, 2] # preserve all source self-attention
59
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
60
+
61
+
62
+ 1:
63
+ is_replace_controller: False
64
+ cross_replace_steps:
65
+ default_: 0.7
66
+ self_replace_steps: 0.7
67
+
68
+ # eq_params:
69
+ # words: ["yellow", "rubber"]
70
+ # values: [10, 10] # amplify attention to the word "tiger" by *2
71
+
72
+
73
+
74
+
75
+ clip_length: "${..train_dataset.n_sample_frame}"
76
+ sample_seeds: [0]
77
+ val_all_frames: False
78
+
79
+ num_inference_steps: 50
80
+ prompt2prompt_edit: True
81
+
82
+
83
+ model_config:
84
+ lora: 160
85
+ # temporal_downsample_time: 4
86
+ SparseCausalAttention_index: ['mid']
87
+ least_sc_channel: 640
88
+ # least_sc_channel: 100000
89
+
90
+ test_pipeline_config:
91
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
92
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
93
+
94
+ epsilon: 1e-5
95
+ train_steps: 10
96
+ seed: 0
97
+ learning_rate: 1e-5
98
+ train_temporal_conv: False
99
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/fox_wolf_snow.yaml ADDED
@@ -0,0 +1,107 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/fox_wolf_snow.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "data/attribute/fox_wolf_snow"
7
+ prompt: "a white fox sitting in the grass"
8
+ n_sample_frame: 8
9
+ # n_sample_frame: 22
10
+ sampling_rate: 1
11
+ stride: 80
12
+ offset:
13
+ left: 0
14
+ right: 0
15
+ top: 0
16
+ bottom: 0
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: True
20
+ use_inversion_attention: True
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ # source prompt
24
+ a white fox sitting in the grass,
25
+
26
+ # foreground texture style
27
+ a grey wolf sitting in the grass,
28
+ a white fox sitting in the snow
29
+ ]
30
+ p2p_config:
31
+ 0:
32
+ # Whether to directly copy the cross attention from source
33
+ # True: directly copy, better for object replacement
34
+ # False: keep source attention, better for style
35
+ is_replace_controller: False
36
+
37
+ # Semantic preserving and replacement Debug me
38
+ cross_replace_steps:
39
+ default_: 0.8
40
+
41
+ # Source background structure preserving, in [0, 1].
42
+ # e.g., =0.6 Replace the first 60% steps self-attention
43
+ self_replace_steps: 0.6
44
+
45
+
46
+ # Amplify the target-words cross attention, larger value, more close to target
47
+ eq_params:
48
+ words: ["silver", "sculpture"]
49
+ values: [2,2]
50
+
51
+ # Target structure-divergence hyperparames
52
+ # If you change the shape of object better to use all three line, otherwise, no need.
53
+ # Without following three lines, all self-attention will be replaced
54
+ blend_words: [['cat',], ["cat",]]
55
+ masked_self_attention: True
56
+ # masked_latents: False # performance not so good in our case, need debug
57
+ bend_th: [2, 2]
58
+ # preserve source structure of blend_words , [0, 1]
59
+ # default is bend_th: [2, 2] # preserve all source self-attention
60
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
61
+
62
+
63
+ 1:
64
+ is_replace_controller: false
65
+ cross_replace_steps:
66
+ default_: 0.5
67
+ self_replace_steps: 0.5
68
+
69
+ eq_params:
70
+ words: ["robotic"]
71
+ values: [10] # amplify attention to the word "tiger" by *2
72
+
73
+ 2:
74
+ is_replace_controller: false
75
+ cross_replace_steps:
76
+ default_: 0.5
77
+ self_replace_steps: 0.5
78
+ eq_params:
79
+ words: ["snow"]
80
+ values: [10] # amplify attention to the word "tiger" by *2
81
+
82
+
83
+ clip_length: "${..train_dataset.n_sample_frame}"
84
+ sample_seeds: [0]
85
+ val_all_frames: False
86
+
87
+ num_inference_steps: 50
88
+ prompt2prompt_edit: True
89
+
90
+
91
+ model_config:
92
+ lora: 160
93
+ # temporal_downsample_time: 4
94
+ SparseCausalAttention_index: ['mid']
95
+ least_sc_channel: 640
96
+ # least_sc_channel: 100000
97
+
98
+ test_pipeline_config:
99
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
100
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
101
+
102
+ epsilon: 1e-5
103
+ train_steps: 10
104
+ seed: 0
105
+ learning_rate: 1e-5
106
+ train_temporal_conv: False
107
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/rabbit_straberry_leaves_flowers.yaml ADDED
@@ -0,0 +1,114 @@
1
+ # CUDA_VISIBLE_DEVICES=1 python test_fatezero.py --config config/attribute/rabbit_straberry_leaves_flowers.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+
6
+ train_dataset:
7
+ path: "data/attribute/rabbit_strawberry"
8
+ prompt: "A rabbit is eating strawberries"
9
+ n_sample_frame: 8
10
+ # n_sample_frame: 22
11
+ sampling_rate: 1
12
+ stride: 80
13
+ offset:
14
+ left: 0
15
+ right: 0
16
+ top: 0
17
+ bottom: 0
18
+
19
+ validation_sample_logger_config:
20
+ use_train_latents: True
21
+ use_inversion_attention: True
22
+ guidance_scale: 7.5
23
+ prompts: [
24
+ # source prompt
25
+ A rabbit is eating strawberries,
26
+
27
+ # foreground texture style
28
+ A white rabbit is eating leaves,
29
+ A white rabbit is eating flower,
30
+ A white rabbit is eating orange,
31
+
32
+ # a brown lion walking on the rock against a wall,
33
+ ]
34
+ p2p_config:
35
+ 0:
36
+ # Whether to directly copy the cross attention from source
37
+ # True: directly copy, better for object replacement
38
+ # False: keep source attention, better for style
39
+ is_replace_controller: False
40
+
41
+ # Semantic preserving and replacement Debug me
42
+ cross_replace_steps:
43
+ default_: 0.8
44
+
45
+ # Source background structure preserving, in [0, 1].
46
+ # e.g., =0.6 Replace the first 60% steps self-attention
47
+ self_replace_steps: 0.6
48
+
49
+
50
+ # Amplify the target-words cross attention, larger value, more close to target
51
+ eq_params:
52
+ words: ["silver", "sculpture"]
53
+ values: [2,2]
54
+
55
+ # Target structure-divergence hyperparames
56
+ # If you change the shape of object better to use all three line, otherwise, no need.
57
+ # Without following three lines, all self-attention will be replaced
58
+ blend_words: [['cat',], ["cat",]]
59
+ masked_self_attention: True
60
+ # masked_latents: False # performance not so good in our case, need debug
61
+ bend_th: [2, 2]
62
+ # preserve source structure of blend_words , [0, 1]
63
+ # default is bend_th: [2, 2] # preserve all source self-attention
64
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
65
+ 1:
66
+ is_replace_controller: false
67
+ cross_replace_steps:
68
+ default_: 0.5
69
+ self_replace_steps: 0.5
70
+ eq_params:
71
+ words: ["leaves"]
72
+ values: [10]
73
+ 2:
74
+ is_replace_controller: false
75
+ cross_replace_steps:
76
+ default_: 0.5
77
+ self_replace_steps: 0.5
78
+ eq_params:
79
+ words: ["flower"]
80
+ values: [10]
81
+ 3:
82
+ is_replace_controller: false
83
+ cross_replace_steps:
84
+ default_: 0.5
85
+ self_replace_steps: 0.5
86
+ eq_params:
87
+ words: ["orange"]
88
+ values: [10]
89
+
90
+ clip_length: "${..train_dataset.n_sample_frame}"
91
+ sample_seeds: [0]
92
+ val_all_frames: False
93
+
94
+ num_inference_steps: 50
95
+ prompt2prompt_edit: True
96
+
97
+
98
+ model_config:
99
+ lora: 160
100
+ # temporal_downsample_time: 4
101
+ SparseCausalAttention_index: ['mid']
102
+ least_sc_channel: 640
103
+ # least_sc_channel: 100000
104
+
105
+ test_pipeline_config:
106
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
107
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
108
+
109
+ epsilon: 1e-5
110
+ train_steps: 10
111
+ seed: 0
112
+ learning_rate: 1e-5
113
+ train_temporal_conv: False
114
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/squ_carrot_robot_eggplant.yaml ADDED
@@ -0,0 +1,123 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/squ_carrot_robot_eggplant.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+
6
+ train_dataset:
7
+ path: "data/attribute/squirrel_carrot"
8
+ prompt: "A squirrel is eating a carrot"
9
+ n_sample_frame: 8
10
+ # n_sample_frame: 22
11
+ sampling_rate: 1
12
+ stride: 80
13
+ offset:
14
+ left: 0
15
+ right: 0
16
+ top: 0
17
+ bottom: 0
18
+
19
+ validation_sample_logger_config:
20
+ use_train_latents: True
21
+ use_inversion_attention: True
22
+ guidance_scale: 7.5
23
+ prompts: [
24
+ # source prompt
25
+ A squirrel is eating a carrot,
26
+ A robot squirrel is eating a carrot,
27
+ A rabbit is eating a eggplant,
28
+ A robot mouse is eating a screwdriver,
29
+ A white mouse is eating a peanut,
30
+ ]
31
+ p2p_config:
32
+ 0:
33
+ # Whether to directly copy the cross attention from source
34
+ # True: directly copy, better for object replacement
35
+ # False: keep source attention, better for style
36
+ is_replace_controller: False
37
+
38
+ # Semantic preserving and replacement Debug me
39
+ cross_replace_steps:
40
+ default_: 0.8
41
+
42
+ # Source background structure preserving, in [0, 1].
43
+ # e.g., =0.6 Replace the first 60% steps self-attention
44
+ self_replace_steps: 0.6
45
+
46
+
47
+ # Amplify the target-words cross attention, larger value, more close to target
48
+ eq_params:
49
+ words: ["silver", "sculpture"]
50
+ values: [2,2]
51
+
52
+ # Target structure-divergence hyperparames
53
+ # If you change the shape of object better to use all three line, otherwise, no need.
54
+ # Without following three lines, all self-attention will be replaced
55
+ blend_words: [['cat',], ["cat",]]
56
+ masked_self_attention: True
57
+ # masked_latents: False # performance not so good in our case, need debug
58
+ bend_th: [2, 2]
59
+ # preserve source structure of blend_words , [0, 1]
60
+ # default is bend_th: [2, 2] # preserve all source self-attention
61
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source acttention
62
+
63
+
64
+ 1:
65
+ is_replace_controller: false
66
+ cross_replace_steps:
67
+ default_: 0.5
68
+ self_replace_steps: 0.4
69
+ eq_params:
70
+ words: ["rabbit", "mouse", "robot", "eggplant", "peanut", "screwdriver"]
71
+ values: [10, 10, 20, 10, 10, 10]
72
+ 2:
73
+ is_replace_controller: false
74
+ cross_replace_steps:
75
+ default_: 0.5
76
+ self_replace_steps: 0.5
77
+ eq_params:
78
+ words: ["rabbit", "mouse", "robot", "eggplant", "peanut", "screwdriver"]
79
+ values: [10, 10, 20, 10, 10, 10]
80
+ 3:
81
+ is_replace_controller: false
82
+ cross_replace_steps:
83
+ default_: 0.5
84
+ self_replace_steps: 0.5
85
+ eq_params:
86
+ words: ["rabbit", "mouse", "robot", "eggplant", "peanut", "screwdriver"]
87
+ values: [10, 10, 20, 10, 10, 10]
88
+ 4:
89
+ is_replace_controller: false
90
+ cross_replace_steps:
91
+ default_: 0.5
92
+ self_replace_steps: 0.5
93
+ eq_params:
94
+ words: ["rabbit", "mouse", "robot", "eggplant", "peanut", "screwdriver"]
95
+ values: [10, 10, 20, 10, 10, 10]
96
+
97
+
98
+
99
+ clip_length: "${..train_dataset.n_sample_frame}"
100
+ sample_seeds: [0]
101
+ val_all_frames: False
102
+
103
+ num_inference_steps: 50
104
+ prompt2prompt_edit: True
105
+
106
+
107
+ model_config:
108
+ lora: 160
109
+ # temporal_downsample_time: 4
110
+ SparseCausalAttention_index: ['mid']
111
+ least_sc_channel: 640
112
+ # least_sc_channel: 100000
113
+
114
+ test_pipeline_config:
115
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
116
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
117
+
118
+ epsilon: 1e-5
119
+ train_steps: 10
120
+ seed: 0
121
+ learning_rate: 1e-5
122
+ train_temporal_conv: False
123
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/attribute/swan_swa.yaml ADDED
@@ -0,0 +1,102 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/attribute/swan_swa.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+
6
+ train_dataset:
7
+ path: "data/attribute/swan_swarov"
8
+ prompt: "a black swan with a red beak swimming in a river near a wall and bushes,"
9
+ n_sample_frame: 8
10
+ # n_sample_frame: 22
11
+ sampling_rate: 1
12
+ stride: 80
13
+ offset:
14
+ left: 0
15
+ right: 0
16
+ top: 0
17
+ bottom: 0
18
+
19
+ use_train_latents: True
20
+
21
+ validation_sample_logger_config:
22
+ use_train_latents: True
23
+ use_inversion_attention: True
24
+ guidance_scale: 7.5
25
+ prompts: [
26
+ # source prompt
27
+ a black swan with a red beak swimming in a river near a wall and bushes,
28
+
29
+ # foreground texture style
30
+ a Swarovski crystal swan with a red beak swimming in a river near a wall and bushes,
31
+ ]
32
+ p2p_config:
33
+ 0:
34
+ # Whether to directly copy the cross attention from source
35
+ # True: directly copy, better for object replacement
36
+ # False: keep source attention, better for style
37
+ is_replace_controller: False
38
+
39
+ # Semantic layout preserving and replacement
40
+ cross_replace_steps:
41
+ default_: 0.8
42
+
43
+ # Source background structure preserving, in [0, 1].
44
+ # e.g., =0.6 Replace the first 60% steps self-attention
45
+ self_replace_steps: 0.6
46
+
47
+
48
+ # Amplify the target-words cross attention, larger value, more close to target
49
+ eq_params:
50
+ words: ["silver", "sculpture"]
51
+ values: [2,2]
52
+
53
+ # Target structure-divergence hyperparameters
54
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
55
+ # Without following three lines, all self-attention will be replaced
56
+ blend_words: [['cat',], ["cat",]]
57
+ masked_self_attention: True
58
+ # masked_latents: False # performance not so good in our case, need debug
59
+ bend_th: [2, 2]
60
+ # preserve source structure of blend_words , [0, 1]
61
+ # default is bend_th: [2, 2] # preserve all source self-attention
62
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
63
+
64
+
65
+ 1:
66
+ is_replace_controller: False
67
+ cross_replace_steps:
68
+ default_: 0.8
69
+ self_replace_steps: 0.6
70
+
71
+ eq_params:
72
+ words: ["Swarovski", "crystal"]
73
+ values: [5, 5] # amplify attention to the words "Swarovski" and "crystal" by *5
74
+ use_inversion_attention: True
75
+
76
+
77
+
78
+ clip_length: "${..train_dataset.n_sample_frame}"
79
+ sample_seeds: [0]
80
+ val_all_frames: False
81
+
82
+ num_inference_steps: 50
83
+ prompt2prompt_edit: True
84
+
85
+
86
+ model_config:
87
+ lora: 160
88
+ # temporal_downsample_time: 4
89
+ SparseCausalAttention_index: ['mid']
90
+ least_sc_channel: 1280
91
+ # least_sc_channel: 100000
92
+
93
+ test_pipeline_config:
94
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
95
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
96
+
97
+ epsilon: 1e-5
98
+ train_steps: 10
99
+ seed: 0
100
+ learning_rate: 1e-5
101
+ train_temporal_conv: False
102
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml ADDED
@@ -0,0 +1,83 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml
2
+
3
+ pretrained_model_path: "FateZero/ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "FateZero/data/teaser_car-turn"
7
+ prompt: "a silver jeep driving down a curvy road in the countryside"
8
+ n_sample_frame: 8
9
+ sampling_rate: 1
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: true
20
+ use_inversion_attention: true
21
+ guidance_scale: 7.5
22
+ source_prompt: "${train_dataset.prompt}"
23
+ prompts: [
24
+ # a silver jeep driving down a curvy road in the countryside,
25
+ watercolor painting of a silver jeep driving down a curvy road in the countryside,
26
+ ]
27
+ p2p_config:
28
+ 0:
29
+ # Whether to directly copy the cross attention from source
30
+ # True: directly copy, better for object replacement
31
+ # False: keep source attention, better for style
32
+
33
+ is_replace_controller: False
34
+
35
+ # Semantic layout preserving. High steps, replace more cross attention to preserve semantic layout
36
+ cross_replace_steps:
37
+ default_: 0.8
38
+
39
+ # Source background structure preserving, in [0, 1].
40
+ # e.g., =0.6 Replace the first 60% steps self-attention
41
+ self_replace_steps: 0.8
42
+
43
+
44
+ # Amplify the target-words cross attention, larger value, more close to target
45
+ eq_params:
46
+ words: ["watercolor"]
47
+ values: [10,10]
48
+
49
+ # Target structure-divergence hyperparameters
50
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
51
+ # Without following three lines, all self-attention will be replaced
52
+ # blend_words: [['jeep',], ["car",]]
53
+ # masked_self_attention: True
54
+ # masked_latents: False # performance not so good in our case, need debug
55
+ # bend_th: [2, 2]
56
+ # preserve source structure of blend_words , [0, 1]
57
+ # default is bend_th: [2, 2] # replace full-resolution edit source with self-attention
58
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
59
+
60
+
61
+ clip_length: "${..train_dataset.n_sample_frame}"
62
+ sample_seeds: [0]
63
+
64
+ num_inference_steps: 10
65
+ prompt2prompt_edit: True
66
+
67
+ model_config:
68
+ lora: 160
69
+ # temporal_downsample_time: 4
70
+ SparseCausalAttention_index: ['mid']
71
+ least_sc_channel: 640
72
+ # least_sc_channel: 100000
73
+
74
+ test_pipeline_config:
75
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
76
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
77
+
78
+ epsilon: 1e-5
79
+ train_steps: 10
80
+ seed: 0
81
+ learning_rate: 1e-5
82
+ train_temporal_conv: False
83
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps_disk_store.yaml ADDED
@@ -0,0 +1,84 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/low_resource_teaser/jeep_watercolor_ddim_10_steps_disk_store.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "data/teaser_car-turn"
7
+ prompt: "a silver jeep driving down a curvy road in the countryside"
8
+ n_sample_frame: 8
9
+ sampling_rate: 1
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: true
20
+ use_inversion_attention: true
21
+ guidance_scale: 7.5
22
+ source_prompt: "${train_dataset.prompt}"
23
+ prompts: [
24
+ # a silver jeep driving down a curvy road in the countryside,
25
+ watercolor painting of a silver jeep driving down a curvy road in the countryside,
26
+ ]
27
+ p2p_config:
28
+ 0:
29
+ # Whether to directly copy the cross attention from source
30
+ # True: directly copy, better for object replacement
31
+ # False: keep source attention, better for style
32
+
33
+ is_replace_controller: False
34
+
35
+ # Semantic layout preserving. High steps, replace more cross attention to preserve semantic layout
36
+ cross_replace_steps:
37
+ default_: 0.8
38
+
39
+ # Source background structure preserving, in [0, 1].
40
+ # e.g., =0.6 Replace the first 60% steps self-attention
41
+ self_replace_steps: 0.8
42
+
43
+
44
+ # Amplify the target-words cross attention, larger value, more close to target
45
+ eq_params:
46
+ words: ["watercolor"]
47
+ values: [10,10]
48
+
49
+ # Target structure-divergence hyperparameters
50
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
51
+ # Without following three lines, all self-attention will be replaced
52
+ # blend_words: [['jeep',], ["car",]]
53
+ # masked_self_attention: True
54
+ # masked_latents: False # performance not so good in our case, need debug
55
+ # bend_th: [2, 2]
56
+ # preserve source structure of blend_words , [0, 1]
57
+ # default is bend_th: [2, 2] # replace full-resolution edit source with self-attention
58
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
59
+
60
+
61
+ clip_length: "${..train_dataset.n_sample_frame}"
62
+ sample_seeds: [0]
63
+
64
+ num_inference_steps: 10
65
+ prompt2prompt_edit: True
66
+
67
+ disk_store: True
68
+ model_config:
69
+ lora: 160
70
+ # temporal_downsample_time: 4
71
+ SparseCausalAttention_index: ['mid']
72
+ least_sc_channel: 640
73
+ # least_sc_channel: 100000
74
+
75
+ test_pipeline_config:
76
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
77
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
78
+
79
+ epsilon: 1e-5
80
+ train_steps: 10
81
+ seed: 0
82
+ learning_rate: 1e-5
83
+ train_temporal_conv: False
84
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/jeep_watercolor.yaml ADDED
@@ -0,0 +1,94 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/style/jeep_watercolor.yaml
2
+
3
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "data/teaser_car-turn"
7
+ prompt: "a silver jeep driving down a curvy road in the countryside"
8
+ n_sample_frame: 8
9
+ sampling_rate: 1
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: true
20
+ use_inversion_attention: true
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ a silver jeep driving down a curvy road in the countryside,
24
+ watercolor painting of a silver jeep driving down a curvy road in the countryside,
25
+ ]
26
+ p2p_config:
27
+ 0:
28
+ # Whether to directly copy the cross attention from source
29
+ # True: directly copy, better for object replacement
30
+ # False: keep source attention, better for style
31
+ is_replace_controller: False
32
+
33
+ # Semantic layout preserving. High steps, replace more cross attention to preserve semantic layout
34
+ cross_replace_steps:
35
+ default_: 0.8
36
+
37
+ # Source background structure preserving, in [0, 1].
38
+ # e.g., =0.6 Replace the first 60% steps self-attention
39
+ self_replace_steps: 0.9
40
+
41
+
42
+ # Amplify the target-words cross attention, larger value, more close to target
43
+ # eq_params:
44
+ # words: ["", ""]
45
+ # values: [10,10]
46
+
47
+ # Target structure-divergence hyperparameters
48
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
49
+ # Without following three lines, all self-attention will be replaced
50
+ # blend_words: [['jeep',], ["car",]]
51
+ masked_self_attention: True
52
+ # masked_latents: False # Directly copy the latents, performance not so good in our case
53
+ bend_th: [2, 2]
54
+ # preserve source structure of blend_words , [0, 1]
55
+ # default is bend_th: [2, 2] # replace full-resolution edit source with self-attention
56
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
57
+
58
+
59
+ 1:
60
+ cross_replace_steps:
61
+ default_: 0.8
62
+ self_replace_steps: 0.8
63
+
64
+ eq_params:
65
+ words: ["watercolor"]
66
+ values: [10] # amplify attention to the word "watercolor" by *10
67
+ use_inversion_attention: True
68
+ is_replace_controller: False
69
+
70
+
71
+ clip_length: "${..train_dataset.n_sample_frame}"
72
+ sample_seeds: [0]
73
+
74
+ num_inference_steps: 50
75
+ prompt2prompt_edit: True
76
+
77
+
78
+ model_config:
79
+ lora: 160
80
+ # temporal_downsample_time: 4
81
+ SparseCausalAttention_index: ['mid']
82
+ least_sc_channel: 640
83
+ # least_sc_channel: 100000
84
+
85
+ test_pipeline_config:
86
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
87
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
88
+
89
+ epsilon: 1e-5
90
+ train_steps: 10
91
+ seed: 0
92
+ learning_rate: 1e-5
93
+ train_temporal_conv: False
94
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/lily_monet.yaml ADDED
@@ -0,0 +1,93 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+
4
+ train_dataset:
5
+ path: "data/style/red_water_lily_opening"
6
+ prompt: "a pink water lily"
7
+ start_sample_frame: 1
8
+ n_sample_frame: 8
9
+ # n_sample_frame: 22
10
+ sampling_rate: 20
11
+ stride: 8000
12
+ # offset:
13
+ # left: 300
14
+ # right: 0
15
+ # top: 0
16
+ # bottom: 0
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: True
20
+ use_inversion_attention: True
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ a pink water lily,
24
+ Claude Monet painting of a pink water lily,
25
+ ]
26
+ p2p_config:
27
+ 0:
28
+ # Whether to directly copy the cross attention from source
29
+ # True: directly copy, better for object replacement
30
+ # False: keep source attention, better for style
31
+ is_replace_controller: False
32
+
33
+ # Semantic layout preserving and replacement
34
+ cross_replace_steps:
35
+ default_: 0.7
36
+
37
+ # Source background structure preserving, in [0, 1].
38
+ # e.g., =0.6 Replace the first 60% steps self-attention
39
+ self_replace_steps: 0.7
40
+
41
+
42
+ # Amplify the target-words cross attention, larger value, more close to target
43
+ eq_params:
44
+ words: ["silver", "sculpture"]
45
+ values: [2,2]
46
+
47
+ # Target structure-divergence hyperparameters
48
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
49
+ # Without following three lines, all self-attention will be replaced
50
+ blend_words: [['cat',], ["cat",]]
51
+ masked_self_attention: True
52
+ # masked_latents: False # performance not so good in our case, need debug
53
+ bend_th: [2, 2]
54
+ # preserve source structure of blend_words , [0, 1]
55
+ # default is bend_th: [2, 2] # preserve all source self-attention
56
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
57
+
58
+
59
+ 1:
60
+ is_replace_controller: False
61
+ cross_replace_steps:
62
+ default_: 0.5
63
+ self_replace_steps: 0.5
64
+
65
+ eq_params:
66
+ words: ["Monet"]
67
+ values: [10]
68
+
69
+ clip_length: "${..train_dataset.n_sample_frame}"
70
+ sample_seeds: [0]
71
+ val_all_frames: False
72
+
73
+ num_inference_steps: 50
74
+ prompt2prompt_edit: True
75
+
76
+
77
+ model_config:
78
+ lora: 160
79
+ # temporal_downsample_time: 4
80
+ SparseCausalAttention_index: ['mid']
81
+ least_sc_channel: 1280
82
+ # least_sc_channel: 100000
83
+
84
+ test_pipeline_config:
85
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
86
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
87
+
88
+ epsilon: 1e-5
89
+ train_steps: 10
90
+ seed: 0
91
+ learning_rate: 1e-5
92
+ train_temporal_conv: False
93
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/rabit_pokemon.yaml ADDED
@@ -0,0 +1,92 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+
4
+ train_dataset:
5
+ path: "data/style/rabit"
6
+ prompt: "A rabbit is eating a watermelon"
7
+ n_sample_frame: 8
8
+ # n_sample_frame: 22
9
+ sampling_rate: 3
10
+ stride: 80
11
+
12
+
13
+ validation_sample_logger_config:
14
+ use_train_latents: True
15
+ use_inversion_attention: True
16
+ guidance_scale: 7.5
17
+ prompts: [
18
+ # source prompt
19
+ A rabbit is eating a watermelon,
20
+ # overall style
21
+ pokemon cartoon of A rabbit is eating a watermelon,
22
+ ]
23
+ p2p_config:
24
+ 0:
25
+ # Whether to directly copy the cross attention from source
26
+ # True: directly copy, better for object replacement
27
+ # False: keep source attention, better for style
28
+ is_replace_controller: False
29
+
30
+ # Semantic layout preserving and replacement
31
+ cross_replace_steps:
32
+ default_: 0.8
33
+
34
+ # Source background structure preserving, in [0, 1].
35
+ # e.g., =0.6 Replace the first 60% steps self-attention
36
+ self_replace_steps: 0.6
37
+
38
+
39
+ # Amplify the target-words cross attention, larger value, more close to target
40
+ eq_params:
41
+ words: ["silver", "sculpture"]
42
+ values: [2,2]
43
+
44
+ # Target structure-divergence hyperparameters
45
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
46
+ # Without following three lines, all self-attention will be replaced
47
+ blend_words: [['cat',], ["cat",]]
48
+ masked_self_attention: True
49
+ # masked_latents: False # performance not so good in our case, need debug
50
+ bend_th: [2, 2]
51
+ # preserve source structure of blend_words , [0, 1]
52
+ # default is bend_th: [2, 2] # preserve all source self-attention
53
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
54
+
55
+
56
+ 1:
57
+ is_replace_controller: False
58
+ cross_replace_steps:
59
+ default_: 0.7
60
+ self_replace_steps: 0.7
61
+
62
+ eq_params:
63
+ words: ["pokemon", "cartoon"]
64
+ values: [3, 3] # amplify attention to the words "pokemon" and "cartoon" by *3
65
+
66
+
67
+
68
+ clip_length: "${..train_dataset.n_sample_frame}"
69
+ sample_seeds: [0]
70
+ val_all_frames: False
71
+
72
+ num_inference_steps: 50
73
+ prompt2prompt_edit: True
74
+
75
+
76
+ model_config:
77
+ # lora: 160
78
+ # temporal_downsample_time: 4
79
+ # SparseCausalAttention_index: ['mid']
80
+ # least_sc_channel: 640
81
+ # least_sc_channel: 100000
82
+
83
+ test_pipeline_config:
84
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
85
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
86
+
87
+ epsilon: 1e-5
88
+ train_steps: 50
89
+ seed: 0
90
+ learning_rate: 1e-5
91
+ train_temporal_conv: False
92
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/sun_flower_van_gogh.yaml ADDED
@@ -0,0 +1,86 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+ train_dataset:
4
+ path: "data/style/sunflower"
5
+ prompt: "a yellow sunflower"
6
+ start_sample_frame: 0
7
+ n_sample_frame: 8
8
+ sampling_rate: 1
9
+
10
+
11
+ validation_sample_logger_config:
12
+ use_train_latents: True
13
+ use_inversion_attention: True
14
+ guidance_scale: 7.5
15
+ prompts: [
16
+ a yellow sunflower,
17
+ van gogh style painting of a yellow sunflower,
18
+ ]
19
+ p2p_config:
20
+ 0:
21
+ # Whether to directly copy the cross attention from source
22
+ # True: directly copy, better for object replacement
23
+ # False: keep source attention, better for style
24
+ is_replace_controller: False
25
+
26
+ # Semantic layout preserving and replacement
27
+ cross_replace_steps:
28
+ default_: 0.7
29
+
30
+ # Source background structure preserving, in [0, 1].
31
+ # e.g., =0.6 Replace the first 60% steps self-attention
32
+ self_replace_steps: 0.7
33
+
34
+
35
+ # Amplify the target-words cross attention, larger value, more close to target
36
+ eq_params:
37
+ words: ["silver", "sculpture"]
38
+ values: [2,2]
39
+
40
+ # Target structure-divergence hyperparameters
41
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
42
+ # Without following three lines, all self-attention will be replaced
43
+ blend_words: [['cat',], ["cat",]]
44
+ masked_self_attention: True
45
+ # masked_latents: False # performance not so good in our case, need debug
46
+ bend_th: [2, 2]
47
+ # preserve source structure of blend_words , [0, 1]
48
+ # default is bend_th: [2, 2] # preserve all source self-attention
49
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
50
+
51
+
52
+ 1:
53
+ is_replace_controller: False
54
+ cross_replace_steps:
55
+ default_: 0.5
56
+ self_replace_steps: 0.5
57
+
58
+ eq_params:
59
+ words: ["van", "gogh"]
60
+ values: [10, 10] # amplify attention to the words "van" and "gogh" by *10
61
+
62
+ clip_length: "${..train_dataset.n_sample_frame}"
63
+ sample_seeds: [0]
64
+ val_all_frames: False
65
+
66
+ num_inference_steps: 50
67
+ prompt2prompt_edit: True
68
+
69
+
70
+ model_config:
71
+ lora: 160
72
+ # temporal_downsample_time: 4
73
+ SparseCausalAttention_index: ['mid']
74
+ least_sc_channel: 640
75
+ # least_sc_channel: 100000
76
+
77
+ test_pipeline_config:
78
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
79
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
80
+
81
+ epsilon: 1e-5
82
+ train_steps: 10
83
+ seed: 0
84
+ learning_rate: 1e-5
85
+ train_temporal_conv: False
86
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/surf_ukiyo.yaml ADDED
@@ -0,0 +1,90 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+ train_dataset:
4
+ path: "data/style/surf"
5
+ prompt: "a man with round helmet surfing on a white wave in blue ocean with a rope"
6
+ n_sample_frame: 1
7
+
8
+ sampling_rate: 8
9
+
10
+
11
+ # use_train_latents: True
12
+
13
+ validation_sample_logger_config:
14
+ use_train_latents: true
15
+ use_inversion_attention: true
16
+ guidance_scale: 7.5
17
+ prompts: [
18
+ a man with round helmet surfing on a white wave in blue ocean with a rope,
19
+ The Ukiyo-e style painting of a man with round helmet surfing on a white wave in blue ocean with a rope
20
+ ]
21
+ p2p_config:
22
+ 0:
23
+ # Whether to directly copy the cross attention from source
24
+ # True: directly copy, better for object replacement
25
+ # False: keep source attention, better for style
26
+ is_replace_controller: False
27
+
28
+ # Semantic layout preserving and replacement
29
+ cross_replace_steps:
30
+ default_: 0.8
31
+
32
+ # Source background structure preserving, in [0, 1].
33
+ # e.g., =0.6 Replace the first 60% steps self-attention
34
+ self_replace_steps: 0.8
35
+
36
+
37
+ # Amplify the target-words cross attention, larger value, more close to target
38
+ eq_params:
39
+ words: ["silver", "sculpture"]
40
+ values: [2,2]
41
+
42
+ # Target structure-divergence hyperparameters
43
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
44
+ # Without following three lines, all self-attention will be replaced
45
+ blend_words: [['cat',], ["cat",]]
46
+ masked_self_attention: True
47
+ # masked_latents: False # performance not so good in our case, need debug
48
+ bend_th: [2, 2]
49
+ # preserve source structure of blend_words , [0, 1]
50
+ # default is bend_th: [2, 2] # preserve all source self-attention
51
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
52
+
53
+ 1:
54
+ is_replace_controller: False
55
+ cross_replace_steps:
56
+ default_: 0.9
57
+ self_replace_steps: 0.9
58
+
59
+ eq_params:
60
+ words: ["Ukiyo-e"]
61
+ values: [10, 10] # amplify attention to the word "Ukiyo-e" by *10
62
+
63
+
64
+
65
+
66
+ clip_length: "${..train_dataset.n_sample_frame}"
67
+ sample_seeds: [0]
68
+ val_all_frames: False
69
+
70
+ num_inference_steps: 50
71
+ prompt2prompt_edit: True
72
+
73
+
74
+ model_config:
75
+ # lora: 160
76
+ # temporal_downsample_time: 4
77
+ SparseCausalAttention_index: ['mid']
78
+ least_sc_channel: 640
79
+ # least_sc_channel: 100000
80
+
81
+ test_pipeline_config:
82
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
83
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
84
+
85
+ epsilon: 1e-5
86
+ train_steps: 50
87
+ seed: 0
88
+ learning_rate: 1e-5
89
+ train_temporal_conv: False
90
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/swan_cartoon.yaml ADDED
@@ -0,0 +1,101 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+
4
+ train_dataset:
5
+ path: "data/style/blackswan"
6
+ prompt: "a black swan with a red beak swimming in a river near a wall and bushes,"
7
+ n_sample_frame: 8
8
+ # n_sample_frame: 22
9
+ sampling_rate: 6
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+ # use_train_latents: True
18
+
19
+ validation_sample_logger_config:
20
+ use_train_latents: true
21
+ use_inversion_attention: true
22
+ guidance_scale: 7.5
23
+ prompts: [
24
+ # source prompt
25
+ a black swan with a red beak swimming in a river near a wall and bushes,
26
+ cartoon photo of a black swan with a red beak swimming in a river near a wall and bushes,
27
+ ]
28
+ p2p_config:
29
+ 0:
30
+ # Whether to directly copy the cross attention from source
31
+ # True: directly copy, better for object replacement
32
+ # False: keep source attention, better for style
33
+ is_replace_controller: False
34
+
35
+ # Semantic layout preserving and replacement
36
+ cross_replace_steps:
37
+ default_: 0.8
38
+
39
+ # Source background structure preserving, in [0, 1].
40
+ # e.g., =0.6 Replace the first 60% steps self-attention
41
+ self_replace_steps: 0.6
42
+
43
+
44
+ # Amplify the target-words cross attention, larger value, more close to target
45
+ eq_params:
46
+ words: ["silver", "sculpture"]
47
+ values: [2,2]
48
+
49
+ # Target structure-divergence hyperparameters
50
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
51
+ # Without following three lines, all self-attention will be replaced
52
+ blend_words: [['cat',], ["cat",]]
53
+ masked_self_attention: True
54
+ # masked_latents: False # performance not so good in our case, need debug
55
+ bend_th: [2, 2]
56
+ # preserve source structure of blend_words , [0, 1]
57
+ # default is bend_th: [2, 2] # preserve all source self-attention
58
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
59
+
60
+ # Fixed hyperparams
61
+ use_inversion_attention: True
62
+
63
+ 1:
64
+ is_replace_controller: False
65
+ cross_replace_steps:
66
+ default_: 0.8
67
+ self_replace_steps: 0.7
68
+
69
+ eq_params:
70
+ words: ["cartoon"]
71
+ values: [10] # amplify attention to the word "cartoon" by *10
72
+ use_inversion_attention: True
73
+
74
+
75
+
76
+ clip_length: "${..train_dataset.n_sample_frame}"
77
+ sample_seeds: [0]
78
+ val_all_frames: False
79
+
80
+ num_inference_steps: 50
81
+ # guidance_scale: 7.5
82
+ prompt2prompt_edit: True
83
+
84
+
85
+ model_config:
86
+ lora: 160
87
+ # temporal_downsample_time: 4
88
+ SparseCausalAttention_index: ['mid']
89
+ least_sc_channel: 640
90
+ # least_sc_channel: 100000
91
+
92
+ test_pipeline_config:
93
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
94
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
95
+
96
+ epsilon: 1e-5
97
+ train_steps: 10
98
+ seed: 0
99
+ learning_rate: 1e-5
100
+ train_temporal_conv: False
101
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/style/train_shinkai.yaml ADDED
@@ -0,0 +1,97 @@
1
+ pretrained_model_path: "./ckpt/stable-diffusion-v1-4"
2
+
3
+ train_dataset:
4
+ path: "data/style/train"
5
+ prompt: "a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track"
6
+ n_sample_frame: 32
7
+ # n_sample_frame: 22
8
+ sampling_rate: 7
9
+ stride: 80
10
+ # offset:
11
+ # left: 300
12
+ # right: 0
13
+ # top: 0
14
+ # bottom: 0
15
+
16
+ use_train_latents: True
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: True
20
+ use_inversion_attention: True
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track,
24
+ a train traveling down tracks next to a forest filled with trees and flowers and a man on the side of the track Makoto Shinkai style
25
+
26
+ ]
27
+ p2p_config:
28
+ 0:
29
+ # Whether to directly copy the cross attention from source
30
+ # True: directly copy, better for object replacement
31
+ # False: keep source attention, better for style
32
+ is_replace_controller: False
33
+
34
+ # Semantic layout preserving and replacement
35
+ cross_replace_steps:
36
+ default_: 1.0
37
+
38
+ # Source background structure preserving, in [0, 1].
39
+ # e.g., =0.6 Replace the first 60% steps self-attention
40
+ self_replace_steps: 1.0
41
+
42
+
43
+ # Amplify the target-words cross attention, larger value, more close to target
44
+ # eq_params:
45
+ # words: ["silver", "sculpture"]
46
+ # values: [2,2]
47
+
48
+ # Target structure-divergence hyperparameters
49
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
50
+ # Without following three lines, all self-attention will be replaced
51
+ # blend_words: [['cat',], ["cat",]]
52
+ # masked_self_attention: True
53
+ # # masked_latents: False # performance not so good in our case, need debug
54
+ # bend_th: [2, 2]
55
+ # preserve source structure of blend_words , [0, 1]
56
+ # default is bend_th: [2, 2] # preserve all source self-attention
57
+ # bend_th : [0.0, 0.0], mask -> 1, use more att_replace, more generated attention, less source attention
58
+
59
+
60
+ 1:
61
+ is_replace_controller: False
62
+ cross_replace_steps:
63
+ default_: 1.0
64
+ self_replace_steps: 0.9
65
+
66
+ eq_params:
67
+ words: ["Makoto", "Shinkai"]
68
+ values: [10, 10] # amplify attention to the words "Makoto" and "Shinkai" by *10
69
+
70
+
71
+
72
+
73
+ clip_length: "${..train_dataset.n_sample_frame}"
74
+ sample_seeds: [0]
75
+ val_all_frames: False
76
+
77
+ num_inference_steps: 50
78
+ prompt2prompt_edit: True
79
+
80
+
81
+ model_config:
82
+ lora: 160
83
+ # temporal_downsample_time: 4
84
+ SparseCausalAttention_index: ['mid']
85
+ least_sc_channel: 1280
86
+ # least_sc_channel: 100000
87
+
88
+ test_pipeline_config:
89
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
90
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
91
+
92
+ epsilon: 1e-5
93
+ train_steps: 10
94
+ seed: 0
95
+ learning_rate: 1e-5
96
+ train_temporal_conv: False
97
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/teaser/jeep_posche.yaml ADDED
@@ -0,0 +1,93 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/teaser/jeep_posche.yaml
2
+
3
+ pretrained_model_path: "./ckpt/jeep_tuned_200"
4
+
5
+ train_dataset:
6
+ path: "data/teaser_car-turn"
7
+ prompt: "a silver jeep driving down a curvy road in the countryside,"
8
+ n_sample_frame: 8
9
+ sampling_rate: 1
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: true
20
+ use_inversion_attention: true
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ a silver jeep driving down a curvy road in the countryside,
24
+ a Porsche car driving down a curvy road in the countryside,
25
+ ]
26
+ p2p_config:
27
+ 0:
28
+ # Whether to directly copy the cross attention from source
29
+ # True: directly copy, better for object replacement
30
+ # False: keep source attention, better for style
31
+ is_replace_controller: False
32
+
33
+ # Semantic layout preserving. High steps, replace more cross attention to preserve semantic layout
34
+ cross_replace_steps:
35
+ default_: 0.8
36
+
37
+ # Source background structure preserving, in [0, 1].
38
+ # e.g., =0.6 Replace the first 60% steps self-attention
39
+ self_replace_steps: 0.9
40
+
41
+
42
+ # Amplify the target-words cross attention, larger value, more close to target
43
+ # Useful in style editing
44
+ eq_params:
45
+ words: ["watercolor", "painting"]
46
+ values: [10,10]
47
+
48
+ # Target structure-divergence hyperparameters
49
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
50
+ # Without following three lines, all self-attention will be replaced
51
+ # Useful in shape editing
52
+ blend_words: [['jeep',], ["car",]]
53
+ masked_self_attention: True
54
+ # masked_latents: False # Directly copy the latents, performance not so good in our case
55
+
56
+ # preserve source structure of blend_words , [0, 1]
57
+ # bend_th-> [1.0, 1.0], mask -> 0, use inversion-time attention, the structure is similar to the input
58
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
59
+ bend_th: [0.3, 0.3]
60
+
61
+ 1:
62
+ cross_replace_steps:
63
+ default_: 0.5
64
+ self_replace_steps: 0.5
65
+
66
+ use_inversion_attention: True
67
+ is_replace_controller: True
68
+
69
+ blend_words: [['silver', 'jeep'], ["Porsche", 'car']] # for local edit. If it is not local yet - use only the source object: blend_word = ((('cat',), ("cat",))).
70
+ masked_self_attention: True
71
+ bend_th: [0.3, 0.3]
72
+
73
+ clip_length: "${..train_dataset.n_sample_frame}"
74
+ sample_seeds: [0]
75
+
76
+ num_inference_steps: 50
77
+ prompt2prompt_edit: True
78
+
79
+
80
+ model_config:
81
+ lora: 160
82
+
83
+
84
+ test_pipeline_config:
85
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
86
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
87
+
88
+ epsilon: 1e-5
89
+ train_steps: 10
90
+ seed: 0
91
+ learning_rate: 1e-5
92
+ train_temporal_conv: False
93
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/config/teaser/jeep_watercolor.yaml ADDED
@@ -0,0 +1,94 @@
1
+ # CUDA_VISIBLE_DEVICES=0 python test_fatezero.py --config config/teaser/jeep_watercolor.yaml
2
+
3
+ pretrained_model_path: "FateZero/ckpt/stable-diffusion-v1-4"
4
+
5
+ train_dataset:
6
+ path: "FateZero/data/teaser_car-turn"
7
+ prompt: "a silver jeep driving down a curvy road in the countryside"
8
+ n_sample_frame: 8
9
+ sampling_rate: 1
10
+ stride: 80
11
+ offset:
12
+ left: 0
13
+ right: 0
14
+ top: 0
15
+ bottom: 0
16
+
17
+
18
+ validation_sample_logger_config:
19
+ use_train_latents: true
20
+ use_inversion_attention: true
21
+ guidance_scale: 7.5
22
+ prompts: [
23
+ a silver jeep driving down a curvy road in the countryside,
24
+ watercolor painting of a silver jeep driving down a curvy road in the countryside,
25
+ ]
26
+ p2p_config:
27
+ 0:
28
+ # Whether to directly copy the cross attention from source
29
+ # True: directly copy, better for object replacement
30
+ # False: keep source attention, better for style
31
+ is_replace_controller: False
32
+
33
+ # Semantic layout preserving. High steps, replace more cross attention to preserve semantic layout
34
+ cross_replace_steps:
35
+ default_: 0.8
36
+
37
+ # Source background structure preserving, in [0, 1].
38
+ # e.g., =0.6 Replace the first 60% steps self-attention
39
+ self_replace_steps: 0.9
40
+
41
+
42
+ # Amplify the target-words cross attention, larger value, more close to target
43
+ # eq_params:
44
+ # words: ["", ""]
45
+ # values: [10,10]
46
+
47
+ # Target structure-divergence hyperparameters
48
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
49
+ # Without following three lines, all self-attention will be replaced
50
+ # blend_words: [['jeep',], ["car",]]
51
+ masked_self_attention: True
52
+ # masked_latents: False # Directly copy the latents, performance not so good in our case
53
+ bend_th: [2, 2]
54
+ # preserve source structure of blend_words , [0, 1]
55
+ # default is bend_th: [2, 2] # replace full-resolution edit source with self-attention
56
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
57
+
58
+
59
+ 1:
60
+ cross_replace_steps:
61
+ default_: 0.8
62
+ self_replace_steps: 0.8
63
+
64
+ eq_params:
65
+ words: ["watercolor"]
66
+ values: [10] # amplify attention to the word "watercolor" by *10
67
+ use_inversion_attention: True
68
+ is_replace_controller: False
69
+
70
+
71
+ clip_length: "${..train_dataset.n_sample_frame}"
72
+ sample_seeds: [0]
73
+
74
+ num_inference_steps: 50
75
+ prompt2prompt_edit: True
76
+
77
+
78
+ model_config:
79
+ lora: 160
80
+ # temporal_downsample_time: 4
81
+ SparseCausalAttention_index: ['mid']
82
+ least_sc_channel: 640
83
+ # least_sc_channel: 100000
84
+
85
+ test_pipeline_config:
86
+ target: video_diffusion.pipelines.p2pDDIMSpatioTemporalPipeline.p2pDDIMSpatioTemporalPipeline
87
+ num_inference_steps: "${..validation_sample_logger.num_inference_steps}"
88
+
89
+ epsilon: 1e-5
90
+ train_steps: 10
91
+ seed: 0
92
+ learning_rate: 1e-5
93
+ train_temporal_conv: False
94
+ guidance_scale: "${validation_sample_logger_config.guidance_scale}"
FateZero/data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *
2
+ !teaser_car-turn
3
+ !teaser_car-turn/*
4
+ !.gitignore
FateZero/data/teaser_car-turn/00000.png ADDED
FateZero/data/teaser_car-turn/00001.png ADDED
FateZero/data/teaser_car-turn/00002.png ADDED
FateZero/data/teaser_car-turn/00003.png ADDED
FateZero/data/teaser_car-turn/00004.png ADDED
FateZero/data/teaser_car-turn/00005.png ADDED
FateZero/data/teaser_car-turn/00006.png ADDED
FateZero/data/teaser_car-turn/00007.png ADDED
FateZero/docs/EditingGuidance.md ADDED
@@ -0,0 +1,65 @@
1
+ # EditingGuidance
2
+
3
+ ## Prompt Engineering
4
+ For the results in the paper and on the webpage, we obtain the source prompt with the BLIP model embedded in the [Stable Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui/).
5
+
6
+ Click the "interrogate CLIP", and we will get a source prompt automatically. Then, we remove the last few useless words.
7
+
8
+ <img src="../docs/blip.png" height="220px"/>
9
+
10
+ During stylization, you may use a very simple source prompt such as "A photo" as a baseline if your input video is too complicated to describe in one sentence.
11
+
12
+ ### Validate the prompt
13
+
14
+ - Put the source prompt into Stable Diffusion. If the generated image is close to the input video, it is likely a good source prompt (see the sketch below).
15
+ - A good prompt describes each frame and most objects in the video. In particular, it should contain the object or attribute that we want to edit or preserve.
16
+ - Put the target prompt into Stable Diffusion to check the upper bound of the editing effect. A reasonable combination of video and target prompt achieves better results (e.g., the "sunflower" video with a "Van Gogh" prompt works better than "sunflower" with "Monet").
17
+
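A minimal editorial sketch of this check (not part of the FateZero code), assuming the Stable Diffusion v1-4 weights have already been downloaded to `./ckpt/stable-diffusion-v1-4` as in the shipped configs:

```python
# Editorial sketch: render one image from a candidate prompt with the same
# SD v1-4 checkpoint that the FateZero configs point to.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "./ckpt/stable-diffusion-v1-4", torch_dtype=torch.float16
).to("cuda")

prompt = "a silver jeep driving down a curvy road in the countryside"  # prompt to validate
image = pipe(prompt).images[0]
image.save("prompt_check.png")
```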
18
+
19
+
20
+
21
+
22
+
23
+ ## FateZero hyperparameters
24
+ We give a simple analysis of the involved hyperparameters as follows:
25
+ ``` yaml
26
+ # Whether to directly copy the cross attention from source
27
+ # True: directly copy, better for object replacement
28
+ # False: keep source attention, better for style
29
+ is_replace_controller: False
30
+
31
+ # Semantic layout preserving. Higher steps replace more cross attention and preserve more of the semantic layout
32
+ cross_replace_steps:
33
+ default_: 0.8
34
+
35
+ # Source background structure preserving, in [0, 1].
36
+ # e.g., 0.6 replaces self-attention in the first 60% of steps
37
+ self_replace_steps: 0.8
38
+
39
+
40
+ # Amplify the cross attention of the target words; a larger value is closer to the target
41
+ # eq_params:
42
+ # words: ["", ""]
43
+ # values: [10,10]
44
+
45
+ # Target structure-divergence hyperparameters
46
+ # If you change the shape of the object, it is better to use all three lines; otherwise, there is no need.
47
+ # Without the following three lines, all self-attention will be replaced
48
+ blend_words: [['jeep',], ["car",]]
49
+ masked_self_attention: True
50
+ # masked_latents: False # Directly copy the latents, performance not so good in our case
51
+ bend_th: [2, 2]
52
+ # preserve source structure of blend_words in [0, 1]
53
+ # default is bend_th: [2, 2] # replace full-resolution edit source with self-attention
54
+ # bend_th-> [0.0, 0.0], mask -> 1, use more edit self-attention, more generated shape, less source attention
55
+ ```
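These values live under `validation_sample_logger_config.p2p_config` in every shipped YAML config, with one entry per prompt in `prompts`. A hedged editorial sketch of inspecting them with OmegaConf (mirroring how `test_fatezero_dataset.py` iterates over `p2p_config`):

```python
# Editorial sketch: print the per-prompt Prompt-to-Prompt hyperparameters of a shipped config.
from omegaconf import OmegaConf

cfg = OmegaConf.load("config/teaser/jeep_watercolor.yaml")
p2p = cfg["validation_sample_logger_config"]["p2p_config"]
for prompt_idx, params in p2p.items():
    print(prompt_idx,
          params.get("cross_replace_steps"),
          params.get("self_replace_steps"),
          params.get("eq_params"))
```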
56
+
57
+ ## DDIM hyperparameters
58
+
59
+ We profile the cost of editing 8 frames on an NVIDIA 3090 with fp16 mixed precision (accelerate) and xformers enabled.
60
+
61
+ | Configs | Attention location | DDIM Inver. Step | CPU memory | GPU memory | Inversion time | Editing time | Quality
62
+ |------------------|------------------ |------------------|------------------|------------------|------------------|----| ---- |
63
+ | [basic](../config/teaser/jeep_watercolor.yaml) | RAM | 50 | 100G | 12G | 60s | 40s | Full support
64
+ | [low cost](../config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml) | RAM | 10 | 15G | 12G | 10s | 10s | OK for style; does not work for shape
65
+ | [lower cost](../config/low_resource_teaser/jeep_watercolor_ddim_10_steps_disk_store.yaml) | DISK | 10 | 6G | 12G | 33s | 100s | OK for style; does not work for shape
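Each row corresponds to one of the shipped configs. As an editorial sketch (equivalent to the CLI commands at the top of each config), the three profiles can also be launched through the `run()` helper defined in `test_fatezero.py`:

```python
# Editorial sketch: drive the three cost profiles through test_fatezero.run().
from test_fatezero import run

run("config/teaser/jeep_watercolor.yaml")                                        # basic: 50 DDIM steps, attention kept in RAM
run("config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml")             # low cost: 10 DDIM steps
run("config/low_resource_teaser/jeep_watercolor_ddim_10_steps_disk_store.yaml")  # lower cost: attention stored on disk
```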
FateZero/docs/OpenSans-Regular.ttf ADDED
Binary file (148 kB).
FateZero/requirements.txt ADDED
@@ -0,0 +1,17 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu113
2
+ torch==1.12.1+cu113 # --index-url https://download.pytorch.org/whl/cu113
3
+ torchvision==0.13.1+cu113 # --index-url https://download.pytorch.org/whl/cu113
4
+ diffusers[torch]==0.11.1
5
+ accelerate==0.15.0
6
+ transformers==4.25.1
7
+ bitsandbytes==0.35.4
8
+ einops
9
+ omegaconf
10
+ ftfy
11
+ tensorboard
12
+ modelcards
13
+ imageio
14
+ triton
15
+ click
16
+ opencv-python
17
+ imageio[ffmpeg]
FateZero/test_fatezero.py ADDED
@@ -0,0 +1,290 @@
1
+ import os
2
+ from glob import glob
3
+ import copy
4
+ from typing import Optional,Dict
5
+ from tqdm.auto import tqdm
6
+ from omegaconf import OmegaConf
7
+ import click
8
+
9
+ import torch
10
+ import torch.utils.data
11
+ import torch.utils.checkpoint
12
+
13
+ from accelerate import Accelerator
14
+ from accelerate.logging import get_logger
15
+ from accelerate.utils import set_seed
16
+ from diffusers import (
17
+ AutoencoderKL,
18
+ DDIMScheduler,
19
+ )
20
+ from diffusers.utils.import_utils import is_xformers_available
21
+ from transformers import AutoTokenizer, CLIPTextModel
22
+ from einops import rearrange
23
+
24
+ import sys
25
+ sys.path.append('FateZero')
26
+ from video_diffusion.models.unet_3d_condition import UNetPseudo3DConditionModel
27
+ from video_diffusion.data.dataset import ImageSequenceDataset
28
+ from video_diffusion.common.util import get_time_string, get_function_args
29
+ from video_diffusion.common.image_util import log_train_samples
30
+ from video_diffusion.common.instantiate_from_config import instantiate_from_config
31
+ from video_diffusion.pipelines.p2pvalidation_loop import p2pSampleLogger
32
+
33
+ logger = get_logger(__name__)
34
+
35
+
36
+ def collate_fn(examples):
37
+ """Concat a batch of sampled image in dataloader
38
+ """
39
+ batch = {
40
+ "prompt_ids": torch.cat([example["prompt_ids"] for example in examples], dim=0),
41
+ "images": torch.stack([example["images"] for example in examples]),
42
+ }
43
+ return batch
44
+
45
+
46
+
47
+ def test(
48
+ config: str,
49
+ pretrained_model_path: str,
50
+ train_dataset: Dict,
51
+ logdir: str = None,
52
+ validation_sample_logger_config: Optional[Dict] = None,
53
+ test_pipeline_config: Optional[Dict] = None,
54
+ gradient_accumulation_steps: int = 1,
55
+ seed: Optional[int] = None,
56
+ mixed_precision: Optional[str] = "fp16",
57
+ train_batch_size: int = 1,
58
+ model_config: dict={},
59
+ verbose: bool=True,
60
+ **kwargs
61
+
62
+ ):
63
+ args = get_function_args()
64
+
65
+ time_string = get_time_string()
66
+ if logdir is None:
67
+ logdir = config.replace('config', 'result').replace('.yml', '').replace('.yaml', '')
68
+ logdir += f"_{time_string}"
69
+
70
+ accelerator = Accelerator(
71
+ gradient_accumulation_steps=gradient_accumulation_steps,
72
+ mixed_precision=mixed_precision,
73
+ )
74
+ if accelerator.is_main_process:
75
+ os.makedirs(logdir, exist_ok=True)
76
+ OmegaConf.save(args, os.path.join(logdir, "config.yml"))
77
+
78
+ if seed is not None:
79
+ set_seed(seed)
80
+
81
+ # Load the tokenizer
82
+ tokenizer = AutoTokenizer.from_pretrained(
83
+ pretrained_model_path,
84
+ subfolder="tokenizer",
85
+ use_fast=False,
86
+ )
87
+
88
+ # Load models and create wrapper for stable diffusion
89
+ text_encoder = CLIPTextModel.from_pretrained(
90
+ pretrained_model_path,
91
+ subfolder="text_encoder",
92
+ )
93
+
94
+ vae = AutoencoderKL.from_pretrained(
95
+ pretrained_model_path,
96
+ subfolder="vae",
97
+ )
98
+
99
+ unet = UNetPseudo3DConditionModel.from_2d_model(
100
+ os.path.join(pretrained_model_path, "unet"), model_config=model_config
101
+ )
102
+
103
+ if 'target' not in test_pipeline_config:
104
+ test_pipeline_config['target'] = 'video_diffusion.pipelines.stable_diffusion.SpatioTemporalStableDiffusionPipeline'
105
+
106
+ pipeline = instantiate_from_config(
107
+ test_pipeline_config,
108
+ vae=vae,
109
+ text_encoder=text_encoder,
110
+ tokenizer=tokenizer,
111
+ unet=unet,
112
+ scheduler=DDIMScheduler.from_pretrained(
113
+ pretrained_model_path,
114
+ subfolder="scheduler",
115
+ ),
116
+ disk_store=kwargs.get('disk_store', False)
117
+ )
118
+ pipeline.scheduler.set_timesteps(validation_sample_logger_config['num_inference_steps'])
119
+ pipeline.set_progress_bar_config(disable=True)
120
+
121
+
122
+ if is_xformers_available():
123
+ try:
124
+ pipeline.enable_xformers_memory_efficient_attention()
125
+ except Exception as e:
126
+ logger.warning(
127
+ "Could not enable memory efficient attention. Make sure xformers is installed"
128
+ f" correctly and a GPU is available: {e}"
129
+ )
130
+
131
+ vae.requires_grad_(False)
132
+ unet.requires_grad_(False)
133
+ text_encoder.requires_grad_(False)
134
+ prompt_ids = tokenizer(
135
+ train_dataset["prompt"],
136
+ truncation=True,
137
+ padding="max_length",
138
+ max_length=tokenizer.model_max_length,
139
+ return_tensors="pt",
140
+ ).input_ids
141
+ train_dataset = ImageSequenceDataset(**train_dataset, prompt_ids=prompt_ids)
142
+
143
+ train_dataloader = torch.utils.data.DataLoader(
144
+ train_dataset,
145
+ batch_size=train_batch_size,
146
+ shuffle=True,
147
+ num_workers=4,
148
+ collate_fn=collate_fn,
149
+ )
150
+ train_sample_save_path = os.path.join(logdir, "train_samples.gif")
151
+ log_train_samples(save_path=train_sample_save_path, train_dataloader=train_dataloader)
152
+
153
+ unet, train_dataloader = accelerator.prepare(
154
+ unet, train_dataloader
155
+ )
156
+
157
+ weight_dtype = torch.float32
158
+ if accelerator.mixed_precision == "fp16":
159
+ weight_dtype = torch.float16
160
+ print('use fp16')
161
+ elif accelerator.mixed_precision == "bf16":
162
+ weight_dtype = torch.bfloat16
163
+
164
+ # Move text_encode and vae to gpu.
165
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
166
+ # These models are only used for inference, keeping weights in full precision is not required.
167
+ vae.to(accelerator.device, dtype=weight_dtype)
168
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
169
+
170
+
171
+ # We need to initialize the trackers we use, and also store our configuration.
172
+ # The trackers initializes automatically on the main process.
173
+ if accelerator.is_main_process:
174
+ accelerator.init_trackers("video") # , config=vars(args))
175
+ logger.info("***** wait to fix the logger path *****")
176
+
177
+ if validation_sample_logger_config is not None and accelerator.is_main_process:
178
+ validation_sample_logger = p2pSampleLogger(**validation_sample_logger_config, logdir=logdir)
179
+ # validation_sample_logger.log_sample_images(
180
+ # pipeline=pipeline,
181
+ # device=accelerator.device,
182
+ # step=0,
183
+ # )
184
+ def make_data_yielder(dataloader):
185
+ while True:
186
+ for batch in dataloader:
187
+ yield batch
188
+ accelerator.wait_for_everyone()
189
+
190
+ train_data_yielder = make_data_yielder(train_dataloader)
191
+
192
+
193
+ batch = next(train_data_yielder)
194
+ if validation_sample_logger_config.get('use_train_latents', False):
195
+ # Precompute the latents for this video to align the initial latents in training and test
196
+ assert batch["images"].shape[0] == 1, "Only support, overfiting on a single video"
197
+ # we only run inference to get the latents, no training
198
+ vae.eval()
199
+ text_encoder.eval()
200
+ unet.eval()
201
+
202
+ text_embeddings = pipeline._encode_prompt(
203
+ train_dataset.prompt,
204
+ device = accelerator.device,
205
+ num_images_per_prompt = 1,
206
+ do_classifier_free_guidance = True,
207
+ negative_prompt=None
208
+ )
209
+
210
+ use_inversion_attention = validation_sample_logger_config.get('use_inversion_attention', False)
211
+ batch['latents_all_step'] = pipeline.prepare_latents_ddim_inverted(
212
+ rearrange(batch["images"].to(dtype=weight_dtype), "b c f h w -> (b f) c h w"),
213
+ batch_size = 1,
214
+ num_images_per_prompt = 1, # not sure how to use it
215
+ text_embeddings = text_embeddings,
216
+ prompt = train_dataset.prompt,
217
+ store_attention=use_inversion_attention,
218
+ LOW_RESOURCE = True, # not classifier-free guidance
219
+ save_path = logdir if verbose else None
220
+ )
221
+
222
+ batch['ddim_init_latents'] = batch['latents_all_step'][-1]
223
+
224
+ else:
225
+ batch['ddim_init_latents'] = None
226
+
227
+ vae.eval()
228
+ text_encoder.eval()
229
+ unet.eval()
230
+
231
+ # with accelerator.accumulate(unet):
232
+ # Convert images to latent space
233
+ images = batch["images"].to(dtype=weight_dtype)
234
+ images = rearrange(images, "b c f h w -> (b f) c h w")
235
+
236
+
237
+ if accelerator.is_main_process:
238
+
239
+ if validation_sample_logger is not None:
240
+ unet.eval()
241
+ samples_all, save_path = validation_sample_logger.log_sample_images(
242
+ image=images, # torch.Size([8, 3, 512, 512])
243
+ pipeline=pipeline,
244
+ device=accelerator.device,
245
+ step=0,
246
+ latents = batch['ddim_init_latents'],
247
+ save_dir = logdir if verbose else None
248
+ )
249
+ # accelerator.log(logs, step=step)
250
+ print('accelerator.end_training()')
251
+ accelerator.end_training()
252
+ return save_path
253
+
254
+
255
+ # @click.command()
256
+ # @click.option("--config", type=str, default="FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml")
257
+ def run(config='FateZero/config/low_resource_teaser/jeep_watercolor_ddim_10_steps.yaml'):
258
+ print(f'in run function {config}')
259
+ Omegadict = OmegaConf.load(config)
260
+ if 'unet' in os.listdir(Omegadict['pretrained_model_path']):
261
+ test(config=config, **Omegadict)
262
+ print('test finished')
263
+ return '/home/cqiaa/diffusion/hugging_face/Tune-A-Video-inference/FateZero/result/low_resource_teaser/jeep_watercolor_ddim_10_steps_230327-200651/sample/step_0_0_0.mp4'
264
+ else:
265
+ # Go through all ckpt if possible
266
+ checkpoint_list = sorted(glob(os.path.join(Omegadict['pretrained_model_path'], 'checkpoint_*')))
267
+ print('checkpoint to evaluate:')
268
+ for checkpoint in checkpoint_list:
269
+ epoch = checkpoint.split('_')[-1]
270
+
271
+ for checkpoint in tqdm(checkpoint_list):
272
+ epoch = checkpoint.split('_')[-1]
273
+ if 'pretrained_epoch_list' not in Omegadict or int(epoch) in Omegadict['pretrained_epoch_list']:
274
+ print(f'Evaluate {checkpoint}')
275
+ # Update saving dir and ckpt
276
+ Omegadict_checkpoint = copy.deepcopy(Omegadict)
277
+ Omegadict_checkpoint['pretrained_model_path'] = checkpoint
278
+
279
+ if 'logdir' not in Omegadict_checkpoint:
280
+ logdir = config.replace('config', 'result').replace('.yml', '').replace('.yaml', '')
281
+ logdir += f"/{os.path.basename(checkpoint)}"
282
+
283
+ Omegadict_checkpoint['logdir'] = logdir
284
+ print(f'Saving at {logdir}')
285
+
286
+ test(config=config, **Omegadict_checkpoint)
287
+
288
+
289
+ if __name__ == "__main__":
290
+ run('FateZero/config/teaser/jeep_watercolor.yaml')
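For reference, `run()` essentially loads the YAML with OmegaConf and forwards it to `test()` (or iterates over checkpoints). A hedged editorial sketch of calling `test()` directly with an overridden `logdir`, following the same pattern used in `test_fatezero_dataset.py`:

```python
# Editorial sketch: bypass run() and call test() with an in-memory config override.
from omegaconf import OmegaConf
from test_fatezero import test

config_path = "FateZero/config/teaser/jeep_watercolor.yaml"
cfg = OmegaConf.load(config_path)
cfg["logdir"] = "result/teaser/jeep_watercolor_custom"  # hypothetical output dir; test() derives one if omitted
test(config=config_path, **cfg)
```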
FateZero/test_fatezero_dataset.py ADDED
@@ -0,0 +1,52 @@
1
+
2
+
3
+ from test_fatezero import *
4
+ from glob import glob
5
+ import copy
6
+
7
+ @click.command()
8
+ @click.option("--edit_config", type=str, default="config/supp/style/0313_style_edit_warp_640.yaml")
9
+ @click.option("--dataset_config", type=str, default="data/supp_edit_dataset/dataset_prompt.yaml")
10
+ def run(edit_config, dataset_config):
11
+ Omegadict_edit_config = OmegaConf.load(edit_config)
12
+ Omegadict_dataset_config = OmegaConf.load(dataset_config)
13
+
14
+ # Go through all data samples
15
+ data_sample_list = sorted(Omegadict_dataset_config.keys())
16
+ print(f'Datasample to evaluate: {data_sample_list}')
17
+ dataset_time_string = get_time_string()
18
+ for data_sample in data_sample_list:
19
+ print(f'Evaluate {data_sample}')
20
+
21
+ for p2p_config_index, p2p_config in Omegadict_edit_config['validation_sample_logger_config']['p2p_config'].items():
22
+ edit_config_now = copy.deepcopy(Omegadict_edit_config)
23
+ edit_config_now['train_dataset'] = copy.deepcopy(Omegadict_dataset_config[data_sample])
24
+ edit_config_now['train_dataset'].pop('target')
25
+ if 'eq_params' in edit_config_now['train_dataset']:
26
+ edit_config_now['train_dataset'].pop('eq_params')
27
+ # edit_config_now['train_dataset']['prompt'] = Omegadict_dataset_config[data_sample]['source']
28
+
29
+ edit_config_now['validation_sample_logger_config']['prompts'] \
30
+ = copy.deepcopy( [Omegadict_dataset_config[data_sample]['prompt'],]+ OmegaConf.to_object(Omegadict_dataset_config[data_sample]['target']))
31
+ p2p_config_now = dict()
32
+ for i in range(len(edit_config_now['validation_sample_logger_config']['prompts'])):
33
+ p2p_config_now[i] = p2p_config
34
+ if 'eq_params' in Omegadict_dataset_config[data_sample]:
35
+ p2p_config_now[i]['eq_params'] = Omegadict_dataset_config[data_sample]['eq_params']
36
+
37
+ edit_config_now['validation_sample_logger_config']['p2p_config'] = copy.deepcopy(p2p_config_now)
38
+ edit_config_now['validation_sample_logger_config']['source_prompt'] = Omegadict_dataset_config[data_sample]['prompt']
39
+ # edit_config_now['validation_sample_logger_config']['source_prompt'] = Omegadict_dataset_config[data_sample]['eq_params']
40
+
41
+
42
+ # if 'logdir' not in edit_config_now:
43
+ logdir = edit_config.replace('config', 'result').replace('.yml', '').replace('.yaml', '')+f'_config_{p2p_config_index}'+f'_{os.path.basename(dataset_config)[:-5]}'+f'_{dataset_time_string}'
44
+ logdir += f"/{data_sample}"
45
+ edit_config_now['logdir'] = logdir
46
+ print(f'Saving at {logdir}')
47
+
48
+ test(config=edit_config, **edit_config_now)
49
+
50
+
51
+ if __name__ == "__main__":
52
+ run()
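For reference, a sketch of the dataset config structure this loop expects, inferred from the keys accessed above (`prompt`, `target`, optional `eq_params`, plus whatever `train_dataset` fields remain after popping those two). The extra dataset fields and the layout inside `eq_params` are assumptions for illustration, not taken from the actual `dataset_prompt.yaml`:

```python
from omegaconf import OmegaConf

# Hypothetical contents of a dataset_prompt.yaml, built in code for illustration.
dataset_config = OmegaConf.create({
    "teaser_car_turn": {
        "path": "data/teaser_car-turn",           # assumed ImageSequenceDataset field
        "prompt": "a silver jeep driving down a curvy road",
        "target": [
            "watercolor painting of a silver jeep driving down a curvy road",
        ],
        "eq_params": {"words": ["watercolor"], "values": [10]},  # optional, assumed layout
    },
})

for data_sample in sorted(dataset_config.keys()):
    entry = dataset_config[data_sample]
    train_dataset = OmegaConf.to_object(entry)
    train_dataset.pop("target")                   # edited prompts are not dataset fields
    train_dataset.pop("eq_params", None)
    print(data_sample, "->", [entry["prompt"], *entry["target"]])
```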
FateZero/test_install.py ADDED
@@ -0,0 +1,23 @@
1
+ import torch
2
+ import os
3
+
4
+ import sys
5
+ print(f"python version {sys.version}")
6
+ print(f"torch version {torch.__version__}")
7
+ print(f"validate gpu status:")
8
+ print( torch.tensor(1.0).cuda()*2)
9
+ os.system("nvcc --version")
10
+
11
+ import diffusers
12
+ print(diffusers.__version__)
13
+ print(diffusers.__file__)
14
+
15
+ try:
16
+ import bitsandbytes
17
+ print(bitsandbytes.__file__)
18
+ except ImportError:
19
+ print("fail to import bitsandbytes")
20
+
21
+ os.system("accelerate env")
22
+
23
+ os.system("python -m xformers.info")
FateZero/train_tune_a_video.py ADDED
@@ -0,0 +1,426 @@
1
+ import os,copy
2
+ import inspect
3
+ from typing import Optional, List, Dict, Union
4
+ import PIL
5
+ import click
6
+ from omegaconf import OmegaConf
7
+
8
+ import torch
9
+ import torch.utils.data
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint
12
+
13
+ from accelerate import Accelerator
14
+ from accelerate.utils import set_seed
15
+ from diffusers import (
16
+ AutoencoderKL,
17
+ DDPMScheduler,
18
+ DDIMScheduler,
19
+ UNet2DConditionModel,
20
+ )
21
+ from diffusers.optimization import get_scheduler
22
+ from diffusers.utils.import_utils import is_xformers_available
23
+ from diffusers.pipeline_utils import DiffusionPipeline
24
+
25
+ from tqdm.auto import tqdm
26
+ from transformers import AutoTokenizer, CLIPTextModel
27
+ from einops import rearrange
28
+
29
+ from video_diffusion.models.unet_3d_condition import UNetPseudo3DConditionModel
30
+ from video_diffusion.data.dataset import ImageSequenceDataset
31
+ from video_diffusion.common.util import get_time_string, get_function_args
32
+ from video_diffusion.common.logger import get_logger_config_path
33
+ from video_diffusion.common.image_util import log_train_samples, log_train_reg_samples
34
+ from video_diffusion.common.instantiate_from_config import instantiate_from_config, get_obj_from_str
35
+ from video_diffusion.pipelines.validation_loop import SampleLogger
36
+
37
+
38
+ def collate_fn(examples):
39
+ batch = {
40
+ "prompt_ids": torch.cat([example["prompt_ids"] for example in examples], dim=0),
41
+ "images": torch.stack([example["images"] for example in examples]),
42
+
43
+ }
44
+ if "class_images" in examples[0]:
45
+ batch["class_prompt_ids"] = torch.cat([example["class_prompt_ids"] for example in examples], dim=0)
46
+ batch["class_images"] = torch.stack([example["class_images"] for example in examples])
47
+ return batch
48
+
49
+
50
+
51
+ def train(
52
+ config: str,
53
+ pretrained_model_path: str,
54
+ train_dataset: Dict,
55
+ logdir: str = None,
56
+ train_steps: int = 300,
57
+ validation_steps: int = 1000,
58
+ validation_sample_logger_config: Optional[Dict] = None,
59
+ test_pipeline_config: Optional[Dict] = dict(),
60
+ trainer_pipeline_config: Optional[Dict] = dict(),
61
+ gradient_accumulation_steps: int = 1,
62
+ seed: Optional[int] = None,
63
+ mixed_precision: Optional[str] = "fp16",
64
+ enable_xformers: bool = True,
65
+ train_batch_size: int = 1,
66
+ learning_rate: float = 3e-5,
67
+ scale_lr: bool = False,
68
+ lr_scheduler: str = "constant", # ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]
69
+ lr_warmup_steps: int = 0,
70
+ use_8bit_adam: bool = True,
71
+ adam_beta1: float = 0.9,
72
+ adam_beta2: float = 0.999,
73
+ adam_weight_decay: float = 1e-2,
74
+ adam_epsilon: float = 1e-08,
75
+ max_grad_norm: float = 1.0,
76
+ gradient_checkpointing: bool = False,
77
+ train_temporal_conv: bool = False,
78
+ checkpointing_steps: int = 1000,
79
+ model_config: dict={},
80
+ # use_train_latents: bool=False,
81
+ # kwr
82
+ # **kwargs
83
+ ):
84
+ args = get_function_args()
85
+ # args.update(kwargs)
86
+ train_dataset_config = copy.deepcopy(train_dataset)
87
+ time_string = get_time_string()
88
+ if logdir is None:
89
+ logdir = config.replace('config', 'result').replace('.yml', '').replace('.yaml', '')
90
+ logdir += f"_{time_string}"
91
+
92
+ accelerator = Accelerator(
93
+ gradient_accumulation_steps=gradient_accumulation_steps,
94
+ mixed_precision=mixed_precision,
95
+ )
96
+ if accelerator.is_main_process:
97
+ os.makedirs(logdir, exist_ok=True)
98
+ OmegaConf.save(args, os.path.join(logdir, "config.yml"))
99
+ logger = get_logger_config_path(logdir)
100
+ if seed is not None:
101
+ set_seed(seed)
102
+
103
+ # Load the tokenizer
104
+ tokenizer = AutoTokenizer.from_pretrained(
105
+ pretrained_model_path,
106
+ subfolder="tokenizer",
107
+ use_fast=False,
108
+ )
109
+
110
+ # Load models and create wrapper for stable diffusion
111
+ text_encoder = CLIPTextModel.from_pretrained(
112
+ pretrained_model_path,
113
+ subfolder="text_encoder",
114
+ )
115
+
116
+ vae = AutoencoderKL.from_pretrained(
117
+ pretrained_model_path,
118
+ subfolder="vae",
119
+ )
120
+
121
+ unet = UNetPseudo3DConditionModel.from_2d_model(
122
+ os.path.join(pretrained_model_path, "unet"), model_config=model_config
123
+ )
124
+
125
+
126
+ if 'target' not in test_pipeline_config:
127
+ test_pipeline_config['target'] = 'video_diffusion.pipelines.stable_diffusion.SpatioTemporalStableDiffusionPipeline'
128
+
129
+ pipeline = instantiate_from_config(
130
+ test_pipeline_config,
131
+ vae=vae,
132
+ text_encoder=text_encoder,
133
+ tokenizer=tokenizer,
134
+ unet=unet,
135
+ scheduler=DDIMScheduler.from_pretrained(
136
+ pretrained_model_path,
137
+ subfolder="scheduler",
138
+ ),
139
+ )
140
+ pipeline.scheduler.set_timesteps(validation_sample_logger_config['num_inference_steps'])
141
+ pipeline.set_progress_bar_config(disable=True)
142
+
143
+
144
+ if is_xformers_available() and enable_xformers:
145
+ # if False: # Disable xformers for null inversion
146
+ try:
147
+ pipeline.enable_xformers_memory_efficient_attention()
148
+ print('enable xformers in the training and testing')
149
+ except Exception as e:
150
+ logger.warning(
151
+ "Could not enable memory efficient attention. Make sure xformers is installed"
152
+ f" correctly and a GPU is available: {e}"
153
+ )
154
+
155
+ vae.requires_grad_(False)
156
+ unet.requires_grad_(False)
157
+ text_encoder.requires_grad_(False)
158
+
159
+ # Start of config trainable parameters in Unet and optimizer
160
+ trainable_modules = ("attn_temporal", ".to_q")
161
+ if train_temporal_conv:
162
+ trainable_modules += ("conv_temporal",)
163
+ for name, module in unet.named_modules():
164
+ if name.endswith(trainable_modules):
165
+ for params in module.parameters():
166
+ params.requires_grad = True
167
+
168
+
169
+ if gradient_checkpointing:
170
+ print('enable gradient checkpointing in the training and testing')
171
+ unet.enable_gradient_checkpointing()
172
+
173
+ if scale_lr:
174
+ learning_rate = (
175
+ learning_rate * gradient_accumulation_steps * train_batch_size * accelerator.num_processes
176
+ )
177
+
178
+ # Use 8-bit Adam for lower memory usage, e.g. to fine-tune the model on 16GB GPUs
179
+ if use_8bit_adam:
180
+ try:
181
+ import bitsandbytes as bnb
182
+ except ImportError:
183
+ raise ImportError(
184
+ "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
185
+ )
186
+
187
+ optimizer_class = bnb.optim.AdamW8bit
188
+ else:
189
+ optimizer_class = torch.optim.AdamW
190
+
191
+ params_to_optimize = unet.parameters()
192
+ num_trainable_modules = 0
193
+ num_trainable_params = 0
194
+ num_unet_params = 0
195
+ for params in params_to_optimize:
196
+ num_unet_params += params.numel()
197
+ if params.requires_grad == True:
198
+ num_trainable_modules +=1
199
+ num_trainable_params += params.numel()
200
+
201
+ logger.info(f"Num of trainable modules: {num_trainable_modules}")
202
+ logger.info(f"Num of trainable params: {num_trainable_params/(1024*1024):.2f} M")
203
+ logger.info(f"Num of unet params: {num_unet_params/(1024*1024):.2f} M ")
204
+
205
+
206
+ params_to_optimize = unet.parameters()
207
+ optimizer = optimizer_class(
208
+ params_to_optimize,
209
+ lr=learning_rate,
210
+ betas=(adam_beta1, adam_beta2),
211
+ weight_decay=adam_weight_decay,
212
+ eps=adam_epsilon,
213
+ )
214
+ # End of config trainable parameters in Unet and optimizer
215
+
216
+
217
+ prompt_ids = tokenizer(
218
+ train_dataset["prompt"],
219
+ truncation=True,
220
+ padding="max_length",
221
+ max_length=tokenizer.model_max_length,
222
+ return_tensors="pt",
223
+ ).input_ids
224
+
225
+ if 'class_data_root' in train_dataset_config:
226
+ if 'class_data_prompt' not in train_dataset_config:
227
+ train_dataset_config['class_data_prompt'] = train_dataset_config['prompt']
228
+ class_prompt_ids = tokenizer(
229
+ train_dataset_config["class_data_prompt"],
230
+ truncation=True,
231
+ padding="max_length",
232
+ max_length=tokenizer.model_max_length,
233
+ return_tensors="pt",
234
+ ).input_ids
235
+ else:
236
+ class_prompt_ids = None
237
+ train_dataset = ImageSequenceDataset(**train_dataset, prompt_ids=prompt_ids, class_prompt_ids=class_prompt_ids)
238
+
239
+ train_dataloader = torch.utils.data.DataLoader(
240
+ train_dataset,
241
+ batch_size=train_batch_size,
242
+ shuffle=True,
243
+ num_workers=16,
244
+ collate_fn=collate_fn,
245
+ )
246
+
247
+ train_sample_save_path = os.path.join(logdir, "train_samples.gif")
248
+ log_train_samples(save_path=train_sample_save_path, train_dataloader=train_dataloader)
249
+ if 'class_data_root' in train_dataset_config:
250
+ log_train_reg_samples(save_path=train_sample_save_path.replace('train_samples', 'class_data_samples'), train_dataloader=train_dataloader)
251
+
252
+ # Prepare learning rate scheduler in accelerate config
253
+ lr_scheduler = get_scheduler(
254
+ lr_scheduler,
255
+ optimizer=optimizer,
256
+ num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
257
+ num_training_steps=train_steps * gradient_accumulation_steps,
258
+ )
259
+
260
+ unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
261
+ unet, optimizer, train_dataloader, lr_scheduler
262
+ )
263
+ accelerator.register_for_checkpointing(lr_scheduler)
264
+
265
+ weight_dtype = torch.float32
266
+ if accelerator.mixed_precision == "fp16":
267
+ weight_dtype = torch.float16
268
+ print('enable float16 in the training and testing')
269
+ elif accelerator.mixed_precision == "bf16":
270
+ weight_dtype = torch.bfloat16
271
+
272
+ # Move text_encode and vae to gpu.
273
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
274
+ # as these models are only used for inference, keeping weights in full precision is not required.
275
+ vae.to(accelerator.device, dtype=weight_dtype)
276
+ text_encoder.to(accelerator.device, dtype=weight_dtype)
277
+
278
+
279
+ # We need to initialize the trackers we use, and also store our configuration.
280
+ # The trackers initializes automatically on the main process.
281
+ if accelerator.is_main_process:
282
+ accelerator.init_trackers("video") # , config=vars(args))
283
+
284
+ # Start of config trainer
285
+ trainer = instantiate_from_config(
286
+ trainer_pipeline_config,
287
+ vae=vae,
288
+ text_encoder=text_encoder,
289
+ tokenizer=tokenizer,
290
+ unet=unet,
291
+ scheduler= DDPMScheduler.from_pretrained(
292
+ pretrained_model_path,
293
+ subfolder="scheduler",
294
+ ),
295
+ # training hyperparams
296
+ weight_dtype=weight_dtype,
297
+ accelerator=accelerator,
298
+ optimizer=optimizer,
299
+ max_grad_norm=max_grad_norm,
300
+ lr_scheduler=lr_scheduler,
301
+ prior_preservation=None
302
+ )
303
+ trainer.print_pipeline(logger)
304
+ # Train!
305
+ total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
306
+ logger.info("***** Running training *****")
307
+ logger.info(f" Num examples = {len(train_dataset)}")
308
+ logger.info(f" Num batches each epoch = {len(train_dataloader)}")
309
+ logger.info(f" Instantaneous batch size per device = {train_batch_size}")
310
+ logger.info(
311
+ f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
312
+ )
313
+ logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
314
+ logger.info(f" Total optimization steps = {train_steps}")
315
+ step = 0
316
+ # End of config trainer
317
+
318
+ if validation_sample_logger_config is not None and accelerator.is_main_process:
319
+ validation_sample_logger = SampleLogger(**validation_sample_logger_config, logdir=logdir)
320
+
321
+
322
+ # Only show the progress bar once on each machine.
323
+ progress_bar = tqdm(
324
+ range(step, train_steps),
325
+ disable=not accelerator.is_local_main_process,
326
+ )
327
+ progress_bar.set_description("Steps")
328
+
329
+ def make_data_yielder(dataloader):
330
+ while True:
331
+ for batch in dataloader:
332
+ yield batch
333
+ accelerator.wait_for_everyone()
334
+
335
+ train_data_yielder = make_data_yielder(train_dataloader)
336
+
337
+
338
+ assert train_dataset.overfit_length == 1, "Only support overfitting on a single video"
339
+ # batch = next(train_data_yielder)
340
+
341
+
342
+ while step < train_steps:
343
+ batch = next(train_data_yielder)
344
+ """************************* start of an iteration*******************************"""
345
+ loss = trainer.step(batch)
346
+ # torch.cuda.empty_cache()
347
+
348
+ """************************* end of an iteration*******************************"""
349
+ # Checks if the accelerator has performed an optimization step behind the scenes
350
+ if accelerator.sync_gradients:
351
+ progress_bar.update(1)
352
+ step += 1
353
+
354
+ if accelerator.is_main_process:
355
+
356
+ if validation_sample_logger is not None and (step % validation_steps == 0):
357
+ unet.eval()
358
+
359
+ val_image = rearrange(batch["images"].to(dtype=weight_dtype), "b c f h w -> (b f) c h w")
360
+
361
+ # The UNet changes across iterations, so we invert the latents online
362
+ if validation_sample_logger_config.get('use_train_latents', False):
363
+ # Precompute the latents for this video to align the initial latents in training and test
364
+ assert batch["images"].shape[0] == 1, "Only support, overfiting on a single video"
365
+ # inference only, to obtain the latents; no training here
366
+ vae.eval()
367
+ text_encoder.eval()
368
+ unet.eval()
369
+
370
+ text_embeddings = pipeline._encode_prompt(
371
+ train_dataset.prompt,
372
+ device = accelerator.device,
373
+ num_images_per_prompt = 1,
374
+ do_classifier_free_guidance = True,
375
+ negative_prompt=None
376
+ )
377
+ batch['latents_all_step'] = pipeline.prepare_latents_ddim_inverted(
378
+ rearrange(batch["images"].to(dtype=weight_dtype), "b c f h w -> (b f) c h w"),
379
+ batch_size = 1 ,
380
+ num_images_per_prompt = 1, # not sure how to use it
381
+ text_embeddings = text_embeddings
382
+ )
383
+ batch['ddim_init_latents'] = batch['latents_all_step'][-1]
384
+ else:
385
+ batch['ddim_init_latents'] = None
386
+
387
+
388
+
389
+ validation_sample_logger.log_sample_images(
390
+ # image=rearrange(train_dataset.get_all()["images"].to(accelerator.device, dtype=weight_dtype), "c f h w -> f c h w"), # torch.Size([8, 3, 512, 512])
391
+ image= val_image, # torch.Size([8, 3, 512, 512])
392
+ pipeline=pipeline,
393
+ device=accelerator.device,
394
+ step=step,
395
+ latents = batch['ddim_init_latents'],
396
+ )
397
+ torch.cuda.empty_cache()
398
+ unet.train()
399
+
400
+ if step % checkpointing_steps == 0:
401
+ accepts_keep_fp32_wrapper = "keep_fp32_wrapper" in set(
402
+ inspect.signature(accelerator.unwrap_model).parameters.keys()
403
+ )
404
+ extra_args = {"keep_fp32_wrapper": True} if accepts_keep_fp32_wrapper else {}
405
+ pipeline_save = get_obj_from_str(test_pipeline_config["target"]).from_pretrained(
406
+ pretrained_model_path,
407
+ unet=accelerator.unwrap_model(unet, **extra_args),
408
+ )
409
+ checkpoint_save_path = os.path.join(logdir, f"checkpoint_{step}")
410
+ pipeline_save.save_pretrained(checkpoint_save_path)
411
+
412
+ logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
413
+ progress_bar.set_postfix(**logs)
414
+ accelerator.log(logs, step=step)
415
+
416
+ accelerator.end_training()
417
+
418
+
419
+ @click.command()
420
+ @click.option("--config", type=str, default="config/sample.yml")
421
+ def run(config):
422
+ train(config=config, **OmegaConf.load(config))
423
+
424
+
425
+ if __name__ == "__main__":
426
+ run()
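A sketch of the kind of config `run()` consumes, restricted to keys that appear in the `train()` signature above. Paths, prompts, and the trainer target are placeholders; the real values should come from the repo's `config/*.yaml` files rather than from this example:

```python
from omegaconf import OmegaConf

# Illustrative only: values below are placeholders, not a working recipe.
cfg = OmegaConf.create({
    "pretrained_model_path": "./ckpt/stable-diffusion-v1-4",
    "train_steps": 300,
    "validation_steps": 100,
    "gradient_checkpointing": True,
    "use_8bit_adam": True,
    "enable_xformers": True,
    "train_dataset": {
        "path": "data/teaser_car-turn",
        "prompt": "a silver jeep driving down a curvy road",
        "n_sample_frame": 8,
        "sampling_rate": 1,
    },
    "validation_sample_logger_config": {
        "use_train_latents": True,
        "num_inference_steps": 50,
        "prompts": ["watercolor painting of a jeep driving down a curvy road"],
    },
    "trainer_pipeline_config": {
        "target": "...",   # trainer class path; see the provided YAML configs
    },
})
# train(config="config/teaser/jeep_watercolor.yaml", **OmegaConf.to_object(cfg))
```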
FateZero/video_diffusion/common/image_util.py ADDED
@@ -0,0 +1,203 @@
1
+ import os
2
+ import math
3
+ import textwrap
4
+
5
+ import imageio
6
+ import numpy as np
7
+ from typing import Sequence
8
+ import requests
9
+ import cv2
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import torch
13
+ from torchvision import transforms
14
+ from einops import rearrange
15
+
16
+
17
+
18
+
19
+
20
+
21
+ IMAGE_EXTENSION = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")
22
+
23
+ FONT_URL = "https://raw.github.com/googlefonts/opensans/main/fonts/ttf/OpenSans-Regular.ttf"
24
+ FONT_PATH = "./docs/OpenSans-Regular.ttf"
25
+
26
+
27
+ def pad(image: Image.Image, top=0, right=0, bottom=0, left=0, color=(255, 255, 255)) -> Image.Image:
28
+ new_image = Image.new(image.mode, (image.width + right + left, image.height + top + bottom), color)
29
+ new_image.paste(image, (left, top))
30
+ return new_image
31
+
32
+
33
+ def download_font_opensans(path=FONT_PATH):
34
+ font_url = FONT_URL
35
+ response = requests.get(font_url)
36
+ os.makedirs(os.path.dirname(path), exist_ok=True)
37
+ with open(path, "wb") as f:
38
+ f.write(response.content)
39
+
40
+
41
+ def annotate_image_with_font(image: Image.Image, text: str, font: ImageFont.FreeTypeFont) -> Image.Image:
42
+ image_w = image.width
43
+ _, _, text_w, text_h = font.getbbox(text)
44
+ line_size = math.floor(len(text) * image_w / text_w)
45
+
46
+ lines = textwrap.wrap(text, width=line_size)
47
+ padding = text_h * len(lines)
48
+ image = pad(image, top=padding + 3)
49
+
50
+ ImageDraw.Draw(image).text((0, 0), "\n".join(lines), fill=(0, 0, 0), font=font)
51
+ return image
52
+
53
+
54
+ def annotate_image(image: Image.Image, text: str, font_size: int = 15):
55
+ if not os.path.isfile(FONT_PATH):
56
+ download_font_opensans()
57
+ font = ImageFont.truetype(FONT_PATH, size=font_size)
58
+ return annotate_image_with_font(image=image, text=text, font=font)
59
+
60
+
61
+ def make_grid(images: Sequence[Image.Image], rows=None, cols=None) -> Image.Image:
62
+ if isinstance(images[0], np.ndarray):
63
+ images = [Image.fromarray(i) for i in images]
64
+
65
+ if rows is None:
66
+ assert cols is not None
67
+ rows = math.ceil(len(images) / cols)
68
+ else:
69
+ cols = math.ceil(len(images) / rows)
70
+
71
+ w, h = images[0].size
72
+ grid = Image.new("RGB", size=(cols * w, rows * h))
73
+ for i, image in enumerate(images):
74
+ if image.size != (w, h):
75
+ image = image.resize((w, h))
76
+ grid.paste(image, box=(i % cols * w, i // cols * h))
77
+ return grid
78
+
79
+
80
+ def save_images_as_gif(
81
+ images: Sequence[Image.Image],
82
+ save_path: str,
83
+ loop=0,
84
+ duration=100,
85
+ optimize=False,
86
+ ) -> None:
87
+
88
+ images[0].save(
89
+ save_path,
90
+ save_all=True,
91
+ append_images=images[1:],
92
+ optimize=optimize,
93
+ loop=loop,
94
+ duration=duration,
95
+ )
96
+
97
+ def save_images_as_mp4(
98
+ images: Sequence[Image.Image],
99
+ save_path: str,
100
+ ) -> None:
101
+ # images[0].save(
102
+ # save_path,
103
+ # save_all=True,
104
+ # append_images=images[1:],
105
+ # optimize=optimize,
106
+ # loop=loop,
107
+ # duration=duration,
108
+ # )
109
+ writer_edit = imageio.get_writer(
110
+ save_path,
111
+ fps=10)
112
+ for i in images:
113
+ init_image = i.convert("RGB")
114
+ writer_edit.append_data(np.array(init_image))
115
+ writer_edit.close()
116
+
117
+
118
+
119
+ def save_images_as_folder(
120
+ images: Sequence[Image.Image],
121
+ save_path: str,
122
+ ) -> None:
123
+ os.makedirs(save_path, exist_ok=True)
124
+ for index, image in enumerate(images):
125
+ init_image = image
126
+ if len(np.array(init_image).shape) == 3:
127
+ cv2.imwrite(os.path.join(save_path, f"{index:05d}.png"), np.array(init_image)[:, :, ::-1])
128
+ else:
129
+ cv2.imwrite(os.path.join(save_path, f"{index:05d}.png"), np.array(init_image))
130
+
131
+ def log_train_samples(
132
+ train_dataloader,
133
+ save_path,
134
+ num_batch: int = 4,
135
+ ):
136
+ train_samples = []
137
+ for idx, batch in enumerate(train_dataloader):
138
+ if idx >= num_batch:
139
+ break
140
+ train_samples.append(batch["images"])
141
+
142
+ train_samples = torch.cat(train_samples).numpy()
143
+ train_samples = rearrange(train_samples, "b c f h w -> b f h w c")
144
+ train_samples = (train_samples * 0.5 + 0.5).clip(0, 1)
145
+ train_samples = numpy_batch_seq_to_pil(train_samples)
146
+ train_samples = [make_grid(images, cols=int(np.ceil(np.sqrt(len(train_samples))))) for images in zip(*train_samples)]
147
+ # save_images_as_gif(train_samples, save_path)
148
+ save_gif_mp4_folder_type(train_samples, save_path)
149
+
150
+ def log_train_reg_samples(
151
+ train_dataloader,
152
+ save_path,
153
+ num_batch: int = 4,
154
+ ):
155
+ train_samples = []
156
+ for idx, batch in enumerate(train_dataloader):
157
+ if idx >= num_batch:
158
+ break
159
+ train_samples.append(batch["class_images"])
160
+
161
+ train_samples = torch.cat(train_samples).numpy()
162
+ train_samples = rearrange(train_samples, "b c f h w -> b f h w c")
163
+ train_samples = (train_samples * 0.5 + 0.5).clip(0, 1)
164
+ train_samples = numpy_batch_seq_to_pil(train_samples)
165
+ train_samples = [make_grid(images, cols=int(np.ceil(np.sqrt(len(train_samples))))) for images in zip(*train_samples)]
166
+ # save_images_as_gif(train_samples, save_path)
167
+ save_gif_mp4_folder_type(train_samples, save_path)
168
+
169
+
170
+ def save_gif_mp4_folder_type(images, save_path, save_gif=False):
171
+
172
+ if isinstance(images[0], np.ndarray):
173
+ images = [Image.fromarray(i) for i in images]
174
+ elif isinstance(images[0], torch.Tensor):
175
+ images = [transforms.ToPILImage()(i.cpu().clone()[0]) for i in images]
176
+ save_path_mp4 = save_path.replace('gif', 'mp4')
177
+ save_path_folder = save_path.replace('.gif', '')
178
+ if save_gif: save_images_as_gif(images, save_path)
179
+ save_images_as_mp4(images, save_path_mp4)
180
+ save_images_as_folder(images, save_path_folder)
181
+
182
+ # copy from video_diffusion/pipelines/stable_diffusion.py
183
+ def numpy_seq_to_pil(images):
184
+ """
185
+ Convert a numpy image or a batch of images to a PIL image.
186
+ """
187
+ if images.ndim == 3:
188
+ images = images[None, ...]
189
+ images = (images * 255).round().astype("uint8")
190
+ if images.shape[-1] == 1:
191
+ # special case for grayscale (single channel) images
192
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
193
+ else:
194
+ pil_images = [Image.fromarray(image) for image in images]
195
+
196
+ return pil_images
197
+
198
+ # copy from diffusers-0.11.1/src/diffusers/pipeline_utils.py
199
+ def numpy_batch_seq_to_pil(images):
200
+ pil_images = []
201
+ for sequence in images:
202
+ pil_images.append(numpy_seq_to_pil(sequence))
203
+ return pil_images
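A small usage sketch for the helpers above (run in the same module, or after importing them): it tiles a handful of dummy frames into a grid and writes the mp4/folder outputs; the file names are arbitrary.

```python
from PIL import Image

# Eight dummy 64x64 frames with varying colors.
frames = [Image.new("RGB", (64, 64), color=(i * 30 % 256, 100, 200)) for i in range(8)]

grid = make_grid(frames, cols=4)          # 2 rows x 4 cols -> a 256x128 contact sheet
grid.save("preview_grid.png")

# Writes preview.mp4 and a preview/ folder of per-frame PNGs;
# pass save_gif=True to also keep preview.gif.
save_gif_mp4_folder_type(frames, "preview.gif", save_gif=False)
```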
FateZero/video_diffusion/common/instantiate_from_config.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Copied from the Stable Diffusion codebase
3
+ """
4
+ import importlib
5
+
6
+
7
+ def instantiate_from_config(config:dict, **args_from_code):
8
+ """Util funciton to decompose differenct modules using config
9
+
10
+ Args:
11
+ config (dict): dict with keys "target" and "params", typically loaded from a YAML file
+ args_from_code: additional keyword arguments supplied from code and merged with "params"
14
+
15
+
16
+ Returns:
17
+ the instantiated object, e.g. a validation/training pipeline or a model module
18
+ """
19
+ if not "target" in config:
20
+ if config == '__is_first_stage__':
21
+ return None
22
+ elif config == "__is_unconditional__":
23
+ return None
24
+ raise KeyError("Expected key `target` to instantiate.")
25
+ return get_obj_from_str(config["target"])(**config.get("params", dict()), **args_from_code)
26
+
27
+
28
+ def get_obj_from_str(string, reload=False):
29
+ module, cls = string.rsplit(".", 1)
30
+ if reload:
31
+ module_imp = importlib.import_module(module)
32
+ importlib.reload(module_imp)
33
+ return getattr(importlib.import_module(module, package=None), cls)
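A minimal usage sketch of the two helpers above: `target` is a dotted import path, `params` are constructor kwargs from the config, and keyword arguments passed in code are merged on top (this is how the training script builds its pipeline and trainer). The `torch.nn.Linear` target here is just an illustrative class:

```python
import torch  # only needed for this illustrative target

layer = instantiate_from_config(
    {"target": "torch.nn.Linear", "params": {"in_features": 4, "out_features": 2}},
    bias=False,  # args_from_code, merged with the params from the config
)
print(layer)                                                   # Linear(in_features=4, out_features=2, bias=False)
print(get_obj_from_str("torch.nn.Linear") is torch.nn.Linear)  # True
```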
FateZero/video_diffusion/common/logger.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ import logging, logging.handlers
3
+ from accelerate.logging import get_logger
4
+
5
+ def get_logger_config_path(logdir):
6
+ # accelerate handles the logger in multiprocessing
7
+ logger = get_logger(__name__)
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s:%(levelname)s : %(message)s',
11
+ datefmt='%a, %d %b %Y %H:%M:%S',
12
+ filename=os.path.join(logdir, 'log.log'),
13
+ filemode='w')
14
+ chlr = logging.StreamHandler()
15
+ chlr.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s : %(message)s'))
16
+ logger.logger.addHandler(chlr)
17
+ return logger
FateZero/video_diffusion/common/set_seed.py ADDED
@@ -0,0 +1,28 @@
1
+ import os
2
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
3
+
4
+ import torch
5
+ import numpy as np
6
+ import random
7
+
8
+ from accelerate.utils import set_seed
9
+
10
+
11
+ def video_set_seed(seed: int):
12
+ """
13
+ Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
14
+
15
+ Args:
16
+ seed (`int`): The seed to set.
17
19
+ """
20
+ set_seed(seed)
21
+ random.seed(seed)
22
+ np.random.seed(seed)
23
+ torch.manual_seed(seed)
24
+ torch.cuda.manual_seed_all(seed)
25
+ torch.backends.cudnn.benchmark = False
26
+ # torch.use_deterministic_algorithms(True, warn_only=True)
27
+ # [W Context.cpp:82] Warning: efficient_attention_forward_cutlass does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True, warn_only=True)'. You can file an issue at https://github.com/pytorch/pytorch/issues to help us prioritize adding deterministic support for this operation. (function alertNotDeterministic)
28
+
FateZero/video_diffusion/common/util.py ADDED
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+ import copy
4
+ import inspect
5
+ import datetime
6
+ from typing import List, Tuple, Optional, Dict
7
+
8
+
9
+ def glob_files(
10
+ root_path: str,
11
+ extensions: Tuple[str],
12
+ recursive: bool = True,
13
+ skip_hidden_directories: bool = True,
14
+ max_directories: Optional[int] = None,
15
+ max_files: Optional[int] = None,
16
+ relative_path: bool = False,
17
+ ) -> Tuple[List[str], bool, bool]:
18
+ """glob files with specified extensions
19
+
20
+ Args:
21
+ root_path (str): _description_
22
+ extensions (Tuple[str]): _description_
23
+ recursive (bool, optional): _description_. Defaults to True.
24
+ skip_hidden_directories (bool, optional): _description_. Defaults to True.
25
+ max_directories (Optional[int], optional): max number of directories to search. Defaults to None.
26
+ max_files (Optional[int], optional): max file number limit. Defaults to None.
27
+ relative_path (bool, optional): _description_. Defaults to False.
28
+
29
+ Returns:
30
+ Tuple[List[str], bool, bool]: _description_
31
+ """
32
+ paths = []
33
+ hit_max_directories = False
34
+ hit_max_files = False
35
+ for directory_idx, (directory, _, fnames) in enumerate(os.walk(root_path, followlinks=True)):
36
+ if skip_hidden_directories and os.path.basename(directory).startswith("."):
37
+ continue
38
+
39
+ if max_directories is not None and directory_idx >= max_directories:
40
+ hit_max_directories = True
41
+ break
42
+
43
+ paths += [
44
+ os.path.join(directory, fname)
45
+ for fname in sorted(fnames)
46
+ if fname.lower().endswith(extensions)
47
+ ]
48
+
49
+ if not recursive:
50
+ break
51
+
52
+ if max_files is not None and len(paths) > max_files:
53
+ hit_max_files = True
54
+ paths = paths[:max_files]
55
+ break
56
+
57
+ if relative_path:
58
+ paths = [os.path.relpath(p, root_path) for p in paths]
59
+
60
+ return paths, hit_max_directories, hit_max_files
61
+
62
+
63
+ def get_time_string() -> str:
64
+ x = datetime.datetime.now()
65
+ return f"{(x.year - 2000):02d}{x.month:02d}{x.day:02d}-{x.hour:02d}{x.minute:02d}{x.second:02d}"
66
+
67
+
68
+ def get_function_args() -> Dict:
69
+ frame = sys._getframe(1)
70
+ args, _, _, values = inspect.getargvalues(frame)
71
+ args_dict = copy.deepcopy({arg: values[arg] for arg in args})
72
+
73
+ return args_dict
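A quick sketch of how the two helpers at the end behave; `train()` above uses `get_function_args()` to snapshot its own keyword arguments into the saved `config.yml`, and `get_time_string()` to suffix the log directory. The `example` function below is hypothetical:

```python
def example(a, b=2, c="x"):
    # get_function_args() captures the caller's own argument names and values as a dict.
    return get_function_args()

print(get_time_string())      # e.g. "240321-153045"  (YYMMDD-HHMMSS)
print(example(1, c="video"))  # {'a': 1, 'b': 2, 'c': 'video'}
```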
FateZero/video_diffusion/data/dataset.py ADDED
@@ -0,0 +1,158 @@
1
+ import os
2
+
3
+ import numpy as np
4
+ from PIL import Image
5
+ from einops import rearrange
6
+ from pathlib import Path
7
+
8
+ import torch
9
+ from torch.utils.data import Dataset
10
+
11
+ from .transform import short_size_scale, random_crop, center_crop, offset_crop
12
+ from ..common.image_util import IMAGE_EXTENSION
13
+
14
+ import sys
15
+ sys.path.append('FateZero')
16
+
17
+ class ImageSequenceDataset(Dataset):
18
+ def __init__(
19
+ self,
20
+ path: str,
21
+ prompt_ids: torch.Tensor,
22
+ prompt: str,
23
+ start_sample_frame: int=0,
24
+ n_sample_frame: int = 8,
25
+ sampling_rate: int = 1,
26
+ stride: int = 1,
27
+ image_mode: str = "RGB",
28
+ image_size: int = 512,
29
+ crop: str = "center",
30
+
31
+ class_data_root: str = None,
32
+ class_prompt_ids: torch.Tensor = None,
33
+
34
+ offset: dict = {
35
+ "left": 0,
36
+ "right": 0,
37
+ "top": 0,
38
+ "bottom": 0
39
+ }
40
+ ):
41
+ self.path = path
42
+ self.images = self.get_image_list(path)
43
+ self.n_images = len(self.images)
44
+ self.offset = offset
45
+
46
+ if n_sample_frame < 0:
47
+ n_sample_frame = len(self.images)
48
+ self.start_sample_frame = start_sample_frame
49
+
50
+ self.n_sample_frame = n_sample_frame
51
+ self.sampling_rate = sampling_rate
52
+
53
+ self.sequence_length = (n_sample_frame - 1) * sampling_rate + 1
54
+ if self.n_images < self.sequence_length:
55
+ raise ValueError("self.n_images < self.sequence_length")
56
+ self.stride = stride
57
+
58
+ self.image_mode = image_mode
59
+ self.image_size = image_size
60
+ crop_methods = {
61
+ "center": center_crop,
62
+ "random": random_crop,
63
+ }
64
+ if crop not in crop_methods:
65
+ raise ValueError(f"crop must be one of {list(crop_methods)}, got {crop!r}")
66
+ self.crop = crop_methods[crop]
67
+
68
+ self.prompt = prompt
69
+ self.prompt_ids = prompt_ids
70
+ self.overfit_length = (self.n_images - self.sequence_length) // self.stride + 1
71
+ # Class (regularization) images for prior preservation
72
+ if class_data_root is not None:
73
+ self.class_data_root = Path(class_data_root)
74
+ self.class_images_path = sorted(list(self.class_data_root.iterdir()))
75
+ self.num_class_images = len(self.class_images_path)
76
+ self.class_prompt_ids = class_prompt_ids
77
+
78
+ self.video_len = (self.n_images - self.sequence_length) // self.stride + 1
79
+
80
+ def __len__(self):
81
+ max_len = (self.n_images - self.sequence_length) // self.stride + 1
82
+
83
+ if hasattr(self, 'num_class_images'):
84
+ max_len = max(max_len, self.num_class_images)
85
+ # return (self.n_images - self.sequence_length) // self.stride + 1
86
+ return max_len
87
+
88
+ def __getitem__(self, index):
89
+ return_batch = {}
90
+ frame_indices = self.get_frame_indices(index%self.video_len)
91
+ frames = [self.load_frame(i) for i in frame_indices]
92
+ frames = self.transform(frames)
93
+
94
+ return_batch.update(
95
+ {
96
+ "images": frames,
97
+ "prompt_ids": self.prompt_ids,
98
+ }
99
+ )
100
+
101
+ if hasattr(self, 'class_data_root'):
102
+ class_index = index % (self.num_class_images - self.n_sample_frame)
103
+ class_indices = self.get_class_indices(class_index)
104
+ frames = [self.load_class_frame(i) for i in class_indices]
105
+ return_batch["class_images"] = self.tensorize_frames(frames)
106
+ return_batch["class_prompt_ids"] = self.class_prompt_ids
107
+ return return_batch
108
+
109
+ def get_all(self, val_length=None):
110
+ if val_length is None:
111
+ val_length = len(self.images)
112
+ frame_indices = (i for i in range(val_length))
113
+ frames = [self.load_frame(i) for i in frame_indices]
114
+ frames = self.transform(frames)
115
+
116
+ return {
117
+ "images": frames,
118
+ "prompt_ids": self.prompt_ids,
119
+ }
120
+
121
+ def transform(self, frames):
122
+ frames = self.tensorize_frames(frames)
123
+ frames = offset_crop(frames, **self.offset)
124
+ frames = short_size_scale(frames, size=self.image_size)
125
+ frames = self.crop(frames, height=self.image_size, width=self.image_size)
126
+ return frames
127
+
128
+ @staticmethod
129
+ def tensorize_frames(frames):
130
+ frames = rearrange(np.stack(frames), "f h w c -> c f h w")
131
+ return torch.from_numpy(frames).div(255) * 2 - 1
132
+
133
+ def load_frame(self, index):
134
+ image_path = os.path.join(self.path, self.images[index])
135
+ return Image.open(image_path).convert(self.image_mode)
136
+
137
+ def load_class_frame(self, index):
138
+ image_path = self.class_images_path[index]
139
+ return Image.open(image_path).convert(self.image_mode)
140
+
141
+ def get_frame_indices(self, index):
142
+ if self.start_sample_frame is not None:
143
+ frame_start = self.start_sample_frame + self.stride * index
144
+ else:
145
+ frame_start = self.stride * index
146
+ return (frame_start + i * self.sampling_rate for i in range(self.n_sample_frame))
147
+
148
+ def get_class_indices(self, index):
149
+ frame_start = index
150
+ return (frame_start + i for i in range(self.n_sample_frame))
151
+
152
+ @staticmethod
153
+ def get_image_list(path):
154
+ images = []
155
+ for file in sorted(os.listdir(path)):
156
+ if file.endswith(IMAGE_EXTENSION):
157
+ images.append(file)
158
+ return images
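A worked sketch of the clip-sampling arithmetic in `ImageSequenceDataset`: with `n_sample_frame=8`, `sampling_rate=2`, `stride=1` and 20 source frames, each item spans `sequence_length = (8 - 1) * 2 + 1 = 15` consecutive frames and there are 6 valid start positions. The numbers below mirror `get_frame_indices` without touching any image files (the frame count is made up):

```python
n_sample_frame, sampling_rate, stride, start_sample_frame = 8, 2, 1, 0
n_images = 20                                                # frames on disk (hypothetical)

sequence_length = (n_sample_frame - 1) * sampling_rate + 1   # 15
video_len = (n_images - sequence_length) // stride + 1       # number of valid clips

def frame_indices(index):
    frame_start = start_sample_frame + stride * index
    return [frame_start + i * sampling_rate for i in range(n_sample_frame)]

print(video_len)           # 6
print(frame_indices(0))    # [0, 2, 4, 6, 8, 10, 12, 14]
print(frame_indices(5))    # [5, 7, 9, 11, 13, 15, 17, 19]
```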
FateZero/video_diffusion/data/transform.py ADDED
@@ -0,0 +1,48 @@
1
+ import random
2
+
3
+ import torch
4
+
5
+
6
+ def short_size_scale(images, size):
7
+ h, w = images.shape[-2:]
8
+ short, long = (h, w) if h < w else (w, h)
9
+
10
+ scale = size / short
11
+ long_target = int(scale * long)
12
+
13
+ target_size = (size, long_target) if h < w else (long_target, size)
14
+
15
+ return torch.nn.functional.interpolate(
16
+ input=images, size=target_size, mode="bilinear", antialias=True
17
+ )
18
+
19
+
20
+ def random_short_side_scale(images, size_min, size_max):
21
+ size = random.randint(size_min, size_max)
22
+ return short_size_scale(images, size)
23
+
24
+
25
+ def random_crop(images, height, width):
26
+ image_h, image_w = images.shape[-2:]
27
+ h_start = random.randint(0, image_h - height)
28
+ w_start = random.randint(0, image_w - width)
29
+ return images[:, :, h_start : h_start + height, w_start : w_start + width]
30
+
31
+
32
+ def center_crop(images, height, width):
33
+ # offset_crop(images, 0,0, 200, 0)
34
+ image_h, image_w = images.shape[-2:]
35
+ h_start = (image_h - height) // 2
36
+ w_start = (image_w - width) // 2
37
+ return images[:, :, h_start : h_start + height, w_start : w_start + width]
38
+
39
+ def offset_crop(image, left=0, right=0, top=200, bottom=0):
40
+
41
+ n, c, h, w = image.shape
42
+ left = min(left, w-1)
43
+ right = min(right, w - left - 1)
44
+ top = min(top, h - 1)
45
+ bottom = min(bottom, h - top - 1)
46
+ image = image[:, :, top:h-bottom, left:w-right]
47
+
48
+ return image
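A quick shape check for the resize/crop helpers above, on a random clip laid out as `c f h w` (the layout produced by `ImageSequenceDataset.tensorize_frames`); only the last two, spatial, dimensions are touched. The input resolution is arbitrary:

```python
import torch

clip = torch.rand(3, 8, 480, 852)                            # c f h w

clip = offset_crop(clip, left=0, right=0, top=0, bottom=0)   # explicit no-op crop
clip = short_size_scale(clip, size=512)                      # 480x852 -> 512x908 (short side to 512)
clip = center_crop(clip, height=512, width=512)
print(clip.shape)                                            # torch.Size([3, 8, 512, 512])
```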
FateZero/video_diffusion/models/attention.py ADDED
@@ -0,0 +1,482 @@
1
+ # code mostly taken from https://github.com/huggingface/diffusers
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from torch import nn
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.modeling_utils import ModelMixin
10
+ from diffusers.models.attention import FeedForward, CrossAttention, AdaLayerNorm
11
+ from diffusers.utils import BaseOutput
12
+ from diffusers.utils.import_utils import is_xformers_available
13
+
14
+ from einops import rearrange
15
+
16
+
17
+ @dataclass
18
+ class SpatioTemporalTransformerModelOutput(BaseOutput):
19
+ """torch.FloatTensor of shape [batch x channel x frames x height x width]"""
20
+
21
+ sample: torch.FloatTensor
22
+
23
+
24
+ if is_xformers_available():
25
+ import xformers
26
+ import xformers.ops
27
+ else:
28
+ xformers = None
29
+
30
+
31
+ class SpatioTemporalTransformerModel(ModelMixin, ConfigMixin):
32
+ @register_to_config
33
+ def __init__(
34
+ self,
35
+ num_attention_heads: int = 16,
36
+ attention_head_dim: int = 88,
37
+ in_channels: Optional[int] = None,
38
+ num_layers: int = 1,
39
+ dropout: float = 0.0,
40
+ norm_num_groups: int = 32,
41
+ cross_attention_dim: Optional[int] = None,
42
+ attention_bias: bool = False,
43
+ activation_fn: str = "geglu",
44
+ num_embeds_ada_norm: Optional[int] = None,
45
+ use_linear_projection: bool = False,
46
+ only_cross_attention: bool = False,
47
+ upcast_attention: bool = False,
48
+ model_config: dict = {},
49
+ **transformer_kwargs,
50
+ ):
51
+ super().__init__()
52
+ self.use_linear_projection = use_linear_projection
53
+ self.num_attention_heads = num_attention_heads
54
+ self.attention_head_dim = attention_head_dim
55
+ inner_dim = num_attention_heads * attention_head_dim
56
+
57
+ # Define input layers
58
+ self.in_channels = in_channels
59
+
60
+ self.norm = torch.nn.GroupNorm(
61
+ num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True
62
+ )
63
+ if use_linear_projection:
64
+ self.proj_in = nn.Linear(in_channels, inner_dim)
65
+ else:
66
+ self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
67
+
68
+ # Define transformers blocks
69
+ self.transformer_blocks = nn.ModuleList(
70
+ [
71
+ SpatioTemporalTransformerBlock(
72
+ inner_dim,
73
+ num_attention_heads,
74
+ attention_head_dim,
75
+ dropout=dropout,
76
+ cross_attention_dim=cross_attention_dim,
77
+ activation_fn=activation_fn,
78
+ num_embeds_ada_norm=num_embeds_ada_norm,
79
+ attention_bias=attention_bias,
80
+ only_cross_attention=only_cross_attention,
81
+ upcast_attention=upcast_attention,
82
+ model_config=model_config,
83
+ **transformer_kwargs,
84
+ )
85
+ for d in range(num_layers)
86
+ ]
87
+ )
88
+
89
+ # Define output layers
90
+ if use_linear_projection:
91
+ self.proj_out = nn.Linear(in_channels, inner_dim)
92
+ else:
93
+ self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
94
+
95
+ def forward(
96
+ self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True
97
+ ):
98
+ # 1. Input
99
+ clip_length = None
100
+ is_video = hidden_states.ndim == 5
101
+ if is_video:
102
+ clip_length = hidden_states.shape[2]
103
+ hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
104
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(clip_length, 0)
105
+ else:
106
+ # Adapt to classifier-free guidance, where encoder_hidden_states comes with batch size 2
107
+ batch_size = hidden_states.shape[0]//encoder_hidden_states.shape[0]
108
+ encoder_hidden_states = encoder_hidden_states.repeat_interleave(batch_size, 0)
109
+ *_, h, w = hidden_states.shape
110
+ residual = hidden_states
111
+
112
+ hidden_states = self.norm(hidden_states)
113
+ if not self.use_linear_projection:
114
+ hidden_states = self.proj_in(hidden_states)
115
+ hidden_states = rearrange(hidden_states, "b c h w -> b (h w) c") # (bf) (hw) c
116
+ else:
117
+ hidden_states = rearrange(hidden_states, "b c h w -> b (h w) c")
118
+ hidden_states = self.proj_in(hidden_states)
119
+
120
+ # 2. Blocks
121
+ for block in self.transformer_blocks:
122
+ hidden_states = block(
123
+ hidden_states, # [16, 4096, 320]
124
+ encoder_hidden_states=encoder_hidden_states, # ([1, 77, 768]
125
+ timestep=timestep,
126
+ clip_length=clip_length,
127
+ )
128
+
129
+ # 3. Output
130
+ if not self.use_linear_projection:
131
+ hidden_states = rearrange(hidden_states, "b (h w) c -> b c h w", h=h, w=w).contiguous()
132
+ hidden_states = self.proj_out(hidden_states)
133
+ else:
134
+ hidden_states = self.proj_out(hidden_states)
135
+ hidden_states = rearrange(hidden_states, "b (h w) c -> b c h w", h=h, w=w).contiguous()
136
+
137
+ output = hidden_states + residual
138
+ if is_video:
139
+ output = rearrange(output, "(b f) c h w -> b c f h w", f=clip_length)
140
+
141
+ if not return_dict:
142
+ return (output,)
143
+
144
+ return SpatioTemporalTransformerModelOutput(sample=output)
145
+
146
+ import copy
147
+ class SpatioTemporalTransformerBlock(nn.Module):
148
+ def __init__(
149
+ self,
150
+ dim: int,
151
+ num_attention_heads: int,
152
+ attention_head_dim: int,
153
+ dropout=0.0,
154
+ cross_attention_dim: Optional[int] = None,
155
+ activation_fn: str = "geglu",
156
+ num_embeds_ada_norm: Optional[int] = None,
157
+ attention_bias: bool = False,
158
+ only_cross_attention: bool = False,
159
+ upcast_attention: bool = False,
160
+ use_sparse_causal_attention: bool = True,
161
+ temporal_attention_position: str = "after_feedforward",
162
+ model_config: dict = {}
163
+ ):
164
+ super().__init__()
165
+
166
+ self.only_cross_attention = only_cross_attention
167
+ self.use_ada_layer_norm = num_embeds_ada_norm is not None
168
+ self.use_sparse_causal_attention = use_sparse_causal_attention
169
+ # For safety, freeze the model_config
170
+ self.model_config = copy.deepcopy(model_config)
171
+ if 'least_sc_channel' in model_config:
172
+ if dim< model_config['least_sc_channel']:
173
+ self.model_config['SparseCausalAttention_index'] = []
174
+
175
+ self.temporal_attention_position = temporal_attention_position
176
+ temporal_attention_positions = ["after_spatial", "after_cross", "after_feedforward"]
177
+ if temporal_attention_position not in temporal_attention_positions:
178
+ raise ValueError(
179
+ f"`temporal_attention_position` must be one of {temporal_attention_positions}"
180
+ )
181
+
182
+ # 1. Spatial-Attn
183
+ spatial_attention = SparseCausalAttention if use_sparse_causal_attention else CrossAttention
184
+ self.attn1 = spatial_attention(
185
+ query_dim=dim,
186
+ heads=num_attention_heads,
187
+ dim_head=attention_head_dim,
188
+ dropout=dropout,
189
+ bias=attention_bias,
190
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
191
+ upcast_attention=upcast_attention,
192
+ ) # is a self-attention
193
+ self.norm1 = (
194
+ AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
195
+ )
196
+
197
+ # 2. Cross-Attn
198
+ if cross_attention_dim is not None:
199
+ self.attn2 = CrossAttention(
200
+ query_dim=dim,
201
+ cross_attention_dim=cross_attention_dim,
202
+ heads=num_attention_heads,
203
+ dim_head=attention_head_dim,
204
+ dropout=dropout,
205
+ bias=attention_bias,
206
+ upcast_attention=upcast_attention,
207
+ ) # is self-attn if encoder_hidden_states is none
208
+ self.norm2 = (
209
+ AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
210
+ )
211
+ else:
212
+ self.attn2 = None
213
+ self.norm2 = None
214
+
215
+ # 3. Temporal-Attn
216
+ self.attn_temporal = CrossAttention(
217
+ query_dim=dim,
218
+ heads=num_attention_heads,
219
+ dim_head=attention_head_dim,
220
+ dropout=dropout,
221
+ bias=attention_bias,
222
+ upcast_attention=upcast_attention,
223
+ )
224
+ nn.init.zeros_(self.attn_temporal.to_out[0].weight.data) # initialize as an identity function
225
+ self.norm_temporal = (
226
+ AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
227
+ )
228
+ # efficient_attention_backward_cutlass is not implemented for large channels
229
+ self.use_xformers = (dim <= 320) or "3090" not in torch.cuda.get_device_name(0)
230
+
231
+ # 4. Feed-forward
232
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
233
+ self.norm3 = nn.LayerNorm(dim)
234
+
235
+ def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
236
+ if not is_xformers_available():
237
+ print("Here is how to install it")
238
+ raise ModuleNotFoundError(
239
+ "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
240
+ " xformers",
241
+ name="xformers",
242
+ )
243
+ elif not torch.cuda.is_available():
244
+ raise ValueError(
245
+ "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
246
+ " available for GPU "
247
+ )
248
+ else:
249
+ try:
250
+ # Make sure we can run the memory efficient attention
251
+ if use_memory_efficient_attention_xformers is True:
252
+
253
+ _ = xformers.ops.memory_efficient_attention(
254
+ torch.randn((1, 2, 40), device="cuda"),
255
+ torch.randn((1, 2, 40), device="cuda"),
256
+ torch.randn((1, 2, 40), device="cuda"),
257
+ )
258
+ else:
259
+
260
+ pass
261
+ except Exception as e:
262
+ raise e
263
+ # self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
264
+ # self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
265
+ self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers and self.use_xformers
266
+ self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers and self.use_xformers
267
+ # self.attn_temporal._use_memory_efficient_attention_xformers = (
268
+ # use_memory_efficient_attention_xformers
269
+ # ), # FIXME: enabling this raises CUDA ERROR. Gotta dig in.
270
+
271
+ def forward(
272
+ self,
273
+ hidden_states,
274
+ encoder_hidden_states=None,
275
+ timestep=None,
276
+ attention_mask=None,
277
+ clip_length=None,
278
+ ):
279
+ # 1. Self-Attention
280
+ norm_hidden_states = (
281
+ self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
282
+ )
283
+
284
+ kwargs = dict(
285
+ hidden_states=norm_hidden_states,
286
+ attention_mask=attention_mask,
287
+ )
288
+ if self.only_cross_attention:
289
+ kwargs.update(encoder_hidden_states=encoder_hidden_states)
290
+ if self.use_sparse_causal_attention:
291
+ kwargs.update(clip_length=clip_length)
292
+ if 'SparseCausalAttention_index' in self.model_config.keys():
293
+ kwargs.update(SparseCausalAttention_index = self.model_config['SparseCausalAttention_index'])
294
+
295
+ hidden_states = hidden_states + self.attn1(**kwargs)
296
+
297
+ if clip_length is not None and self.temporal_attention_position == "after_spatial":
298
+ hidden_states = self.apply_temporal_attention(hidden_states, timestep, clip_length)
299
+
300
+ if self.attn2 is not None:
301
+ # 2. Cross-Attention
302
+ norm_hidden_states = (
303
+ self.norm2(hidden_states, timestep)
304
+ if self.use_ada_layer_norm
305
+ else self.norm2(hidden_states)
306
+ )
307
+ hidden_states = (
308
+ self.attn2(
309
+ norm_hidden_states, # [16, 4096, 320]
310
+ encoder_hidden_states=encoder_hidden_states, # [1, 77, 768]
311
+ attention_mask=attention_mask,
312
+ )
313
+ + hidden_states
314
+ )
315
+
316
+ if clip_length is not None and self.temporal_attention_position == "after_cross":
317
+ hidden_states = self.apply_temporal_attention(hidden_states, timestep, clip_length)
318
+
319
+ # 3. Feed-forward
320
+ hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
321
+
322
+ if clip_length is not None and self.temporal_attention_position == "after_feedforward":
323
+ hidden_states = self.apply_temporal_attention(hidden_states, timestep, clip_length)
324
+
325
+ return hidden_states
326
+
327
+ def apply_temporal_attention(self, hidden_states, timestep, clip_length):
328
+ d = hidden_states.shape[1]
329
+ hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=clip_length)
330
+ norm_hidden_states = (
331
+ self.norm_temporal(hidden_states, timestep)
332
+ if self.use_ada_layer_norm
333
+ else self.norm_temporal(hidden_states)
334
+ )
335
+ hidden_states = self.attn_temporal(norm_hidden_states) + hidden_states
336
+ hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
337
+ return hidden_states
338
+
339
+
340
+ class SparseCausalAttention(CrossAttention):
341
+ def forward(
342
+ self,
343
+ hidden_states,
344
+ encoder_hidden_states=None,
345
+ attention_mask=None,
346
+ clip_length: int = None,
347
+ SparseCausalAttention_index: list = [-1, 'first']
348
+ ):
349
+ if (
350
+ self.added_kv_proj_dim is not None
351
+ or encoder_hidden_states is not None
352
+ or attention_mask is not None
353
+ ):
354
+ raise NotImplementedError
355
+
356
+ if self.group_norm is not None:
357
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
358
+
359
+ query = self.to_q(hidden_states)
360
+ dim = query.shape[-1]
361
+ query = self.reshape_heads_to_batch_dim(query)
362
+
363
+ key = self.to_k(hidden_states)
364
+ value = self.to_v(hidden_states)
365
+
366
+ if clip_length is not None:
367
+ key = rearrange(key, "(b f) d c -> b f d c", f=clip_length)
368
+ value = rearrange(value, "(b f) d c -> b f d c", f=clip_length)
369
+
370
+
371
+ # ***********************Start of SparseCausalAttention_index**********
372
+ frame_index_list = []
373
+ # print(f'SparseCausalAttention_index {str(SparseCausalAttention_index)}')
374
+ if len(SparseCausalAttention_index) > 0:
375
+ for index in SparseCausalAttention_index:
376
+ if isinstance(index, str):
377
+ if index == 'first':
378
+ frame_index = [0] * clip_length
379
+ if index == 'last':
380
+ frame_index = [clip_length-1] * clip_length
381
+ if (index == 'mid') or (index == 'middle'):
382
+ frame_index = [int(clip_length-1)//2] * clip_length
383
+ else:
384
+ assert isinstance(index, int), 'relative index must be int'
385
+ frame_index = torch.arange(clip_length) + index
386
+ frame_index = frame_index.clip(0, clip_length-1)
387
+
388
+ frame_index_list.append(frame_index)
389
+
390
+ key = torch.cat([ key[:, frame_index] for frame_index in frame_index_list
391
+ ], dim=2)
392
+ value = torch.cat([ value[:, frame_index] for frame_index in frame_index_list
393
+ ], dim=2)
394
+
395
+
396
+ # ***********************End of SparseCausalAttention_index**********
397
+ key = rearrange(key, "b f d c -> (b f) d c", f=clip_length)
398
+ value = rearrange(value, "b f d c -> (b f) d c", f=clip_length)
399
+
400
+
401
+ key = self.reshape_heads_to_batch_dim(key)
402
+ value = self.reshape_heads_to_batch_dim(value)
403
+
404
+ # attention, what we cannot get enough of
405
+ if self._use_memory_efficient_attention_xformers:
406
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
407
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
408
+ hidden_states = hidden_states.to(query.dtype)
409
+ else:
410
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
411
+ hidden_states = self._attention(query, key, value, attention_mask)
412
+ else:
413
+ hidden_states = self._sliced_attention(
414
+ query, key, value, hidden_states.shape[1], dim, attention_mask
415
+ )
416
+
417
+ # linear proj
418
+ hidden_states = self.to_out[0](hidden_states)
419
+
420
+ # dropout
421
+ hidden_states = self.to_out[1](hidden_states)
422
+ return hidden_states
423
+
424
+ # FIXME
425
+ class SparseCausalAttention_fixme(CrossAttention):
426
+ def forward(
427
+ self,
428
+ hidden_states,
429
+ encoder_hidden_states=None,
430
+ attention_mask=None,
431
+ clip_length: int = None,
432
+ ):
433
+ if (
434
+ self.added_kv_proj_dim is not None
435
+ or encoder_hidden_states is not None
436
+ or attention_mask is not None
437
+ ):
438
+ raise NotImplementedError
439
+
440
+ if self.group_norm is not None:
441
+ hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
442
+
443
+ query = self.to_q(hidden_states)
444
+ dim = query.shape[-1]
445
+ query = self.reshape_heads_to_batch_dim(query)
446
+
447
+ key = self.to_k(hidden_states)
448
+ value = self.to_v(hidden_states)
449
+
450
+ prev_frame_index = torch.arange(clip_length) - 1
451
+ prev_frame_index[0] = 0
452
+
453
+ key = rearrange(key, "(b f) d c -> b f d c", f=clip_length)
454
+ key = torch.cat([key[:, [0] * clip_length], key[:, prev_frame_index]], dim=2)
455
+ key = rearrange(key, "b f d c -> (b f) d c", f=clip_length)
456
+
457
+ value = rearrange(value, "(b f) d c -> b f d c", f=clip_length)
458
+ value = torch.cat([value[:, [0] * clip_length], value[:, prev_frame_index]], dim=2)
459
+ value = rearrange(value, "b f d c -> (b f) d c", f=clip_length)
460
+
461
+ key = self.reshape_heads_to_batch_dim(key)
462
+ value = self.reshape_heads_to_batch_dim(value)
463
+
464
+
465
+ if self._use_memory_efficient_attention_xformers:
466
+ hidden_states = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
467
+ # Some versions of xformers return output in fp32, cast it back to the dtype of the input
468
+ hidden_states = hidden_states.to(query.dtype)
469
+ else:
470
+ if self._slice_size is None or query.shape[0] // self._slice_size == 1:
471
+ hidden_states = self._attention(query, key, value, attention_mask)
472
+ else:
473
+ hidden_states = self._sliced_attention(
474
+ query, key, value, hidden_states.shape[1], dim, attention_mask
475
+ )
476
+
477
+ # linear proj
478
+ hidden_states = self.to_out[0](hidden_states)
479
+
480
+ # dropout
481
+ hidden_states = self.to_out[1](hidden_states)
482
+ return hidden_states
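Finally, a self-contained sketch of the key/value gathering inside `SparseCausalAttention.forward` above: with `SparseCausalAttention_index = [-1, 'first']`, every frame attends to its previous frame (clamped at the clip start) and to the first frame, doubling the attended tokens per frame. The tensor sizes are arbitrary:

```python
import torch

clip_length = 4
SparseCausalAttention_index = [-1, 'first']

frame_index_list = []
for index in SparseCausalAttention_index:
    if isinstance(index, str):
        if index == 'first':
            frame_index = torch.tensor([0] * clip_length)
        elif index == 'last':
            frame_index = torch.tensor([clip_length - 1] * clip_length)
        else:  # 'mid' / 'middle'
            frame_index = torch.tensor([(clip_length - 1) // 2] * clip_length)
    else:
        frame_index = (torch.arange(clip_length) + index).clip(0, clip_length - 1)
    frame_index_list.append(frame_index)

print(frame_index_list[0].tolist())   # [0, 0, 1, 2]  <- previous frame per query frame
print(frame_index_list[1].tolist())   # [0, 0, 0, 0]  <- first frame for every query frame

# key/value of shape (b, f, d, c) are gathered and concatenated along the token dim.
key = torch.rand(1, clip_length, 16, 8)
key = torch.cat([key[:, fi] for fi in frame_index_list], dim=2)
print(key.shape)                      # torch.Size([1, 4, 32, 8])
```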