maxin-cn committed
Commit 94bafa8
1 Parent(s): c6fae7d

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +2 -0
  2. .gitignore +2 -0
  3. LICENSE +201 -0
  4. README.md +167 -12
  5. configs/ffs/ffs_img_train.yaml +45 -0
  6. configs/ffs/ffs_sample.yaml +30 -0
  7. configs/ffs/ffs_train.yaml +42 -0
  8. configs/sky/sky_img_train.yaml +43 -0
  9. configs/sky/sky_sample.yaml +32 -0
  10. configs/sky/sky_train.yaml +42 -0
  11. configs/t2x/t2i_sample.yaml +37 -0
  12. configs/t2x/t2v_sample.yaml +37 -0
  13. configs/taichi/taichi_img_train.yaml +43 -0
  14. configs/taichi/taichi_sample.yaml +30 -0
  15. configs/taichi/taichi_train.yaml +42 -0
  16. configs/ucf101/ucf101_img_train.yaml +44 -0
  17. configs/ucf101/ucf101_sample.yaml +33 -0
  18. configs/ucf101/ucf101_train.yaml +42 -0
  19. datasets/__init__.py +79 -0
  20. datasets/ffs_datasets.py +164 -0
  21. datasets/ffs_image_datasets.py +246 -0
  22. datasets/sky_datasets.py +110 -0
  23. datasets/sky_image_datasets.py +137 -0
  24. datasets/taichi_datasets.py +108 -0
  25. datasets/taichi_image_datasets.py +139 -0
  26. datasets/ucf101_datasets.py +229 -0
  27. datasets/ucf101_image_datasets.py +279 -0
  28. datasets/video_transforms.py +482 -0
  29. demo.py +284 -0
  30. diffusion/__init__.py +47 -0
  31. diffusion/diffusion_utils.py +88 -0
  32. diffusion/gaussian_diffusion.py +881 -0
  33. diffusion/respace.py +130 -0
  34. diffusion/timestep_sampler.py +150 -0
  35. docs/datasets_evaluation.md +53 -0
  36. docs/latte_diffusers.md +106 -0
  37. environment.yml +25 -0
  38. models/__init__.py +52 -0
  39. models/__pycache__/__init__.cpython-312.pyc +0 -0
  40. models/__pycache__/latte.cpython-312.pyc +0 -0
  41. models/__pycache__/latte_img.cpython-312.pyc +0 -0
  42. models/__pycache__/latte_t2v.cpython-312.pyc +0 -0
  43. models/clip.py +126 -0
  44. models/latte.py +526 -0
  45. models/latte_img.py +552 -0
  46. models/latte_t2v.py +945 -0
  47. models/utils.py +215 -0
  48. sample/__pycache__/pipeline_latte.cpython-312.pyc +0 -0
  49. sample/ffs.sh +7 -0
  50. sample/ffs_ddp.sh +7 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ visuals/latte.gif filter=lfs diff=lfs merge=lfs -text
+ visuals/latteT2V.gif filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .vscode
+ preprocess
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,167 @@
- ---
- title: Latte
- emoji: 🏆
- colorFrom: blue
- colorTo: pink
- sdk: gradio
- sdk_version: 4.39.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Latte
+ app_file: demo.py
+ sdk: gradio
+ sdk_version: 4.37.2
+ ---
+ ## Latte: Latent Diffusion Transformer for Video Generation<br><sub>Official PyTorch Implementation</sub>
+
+ <!-- ### [Paper](https://arxiv.org/abs/2401.03048v1) | [Project Page](https://maxin-cn.github.io/latte_project/) -->
+
+ <!-- [![arXiv](https://img.shields.io/badge/arXiv-2401.03048-b31b1b.svg)](https://arxiv.org/abs/2401.03048) -->
+ [![Arxiv](https://img.shields.io/badge/Arxiv-b31b1b.svg)](https://arxiv.org/abs/2401.03048)
+ [![Project Page](https://img.shields.io/badge/Project-Website-blue)](https://maxin-cn.github.io/latte_project/)
+ [![HF Demo](https://img.shields.io/static/v1?label=Demo&message=OpenBayes%E8%B4%9D%E5%BC%8F%E8%AE%A1%E7%AE%97&color=green)](https://openbayes.com/console/public/tutorials/UOeU0ywVxl7)
+
+ [![Static Badge](https://img.shields.io/badge/Latte--1%20checkpoint%20(T2V)-HuggingFace-yellow?logoColor=violet%20Latte-1%20checkpoint)](https://huggingface.co/maxin-cn/Latte-1)
+ [![Static Badge](https://img.shields.io/badge/Latte%20checkpoint%20-HuggingFace-yellow?logoColor=violet%20Latte%20checkpoint)](https://huggingface.co/maxin-cn/Latte)
+
+ This repo contains PyTorch model definitions, pre-trained weights, training/sampling code and evaluation code for our paper exploring
+ latent diffusion models with transformers (Latte). You can find more visualizations on our [project page](https://maxin-cn.github.io/latte_project/).
+
+ > [**Latte: Latent Diffusion Transformer for Video Generation**](https://maxin-cn.github.io/latte_project/)<br>
+ > [Xin Ma](https://maxin-cn.github.io/), [Yaohui Wang*](https://wyhsirius.github.io/), [Xinyuan Chen](https://scholar.google.com/citations?user=3fWSC8YAAAAJ), [Gengyun Jia](https://scholar.google.com/citations?user=_04pkGgAAAAJ&hl=zh-CN), [Ziwei Liu](https://liuziwei7.github.io/), [Yuan-Fang Li](https://users.monash.edu/~yli/), [Cunjian Chen](https://cunjian.github.io/), [Yu Qiao](https://scholar.google.com.hk/citations?user=gFtI-8QAAAAJ&hl=zh-CN)
+ > (*Corresponding Author & Project Lead)
+ <!-- > <br>Monash University, Shanghai Artificial Intelligence Laboratory,<br> NJUPT, S-Lab, Nanyang Technological University
+
+ We propose a novel Latent Diffusion Transformer, namely Latte, for video generation. Latte first extracts spatio-temporal tokens from input videos and then adopts a series of Transformer blocks to model video distribution in the latent space. In order to model a substantial number of tokens extracted from videos, four efficient variants are introduced from the perspective of decomposing the spatial and temporal dimensions of input videos. To improve the quality of generated videos, we determine the best practices of Latte through rigorous experimental analysis, including video clip patch embedding, model variants, timestep-class information injection, temporal positional embedding, and learning strategies. Our comprehensive evaluation demonstrates that Latte achieves state-of-the-art performance across four standard video generation datasets, i.e., FaceForensics, SkyTimelapse, UCF101, and Taichi-HD. In addition, we extend Latte to text-to-video generation (T2V) task, where Latte achieves comparable results compared to recent T2V models. We strongly believe that Latte provides valuable insights for future research on incorporating Transformers into diffusion models for video generation.
+
+ ![The architecture of Latte](visuals/architecture.svg){width=20}
+ -->
+
+ <!--
+ <div align="center">
+ <img src="visuals/architecture.svg" width="650">
+ </div>
+
+ This repository contains:
+
+ * 🪐 A simple PyTorch [implementation](models/latte.py) of Latte
+ * ⚡️ **Pre-trained Latte models** trained on FaceForensics, SkyTimelapse, Taichi-HD and UCF101 (256x256). In addition, we provide a T2V checkpoint (512x512). All checkpoints can be found [here](https://huggingface.co/maxin-cn/Latte/tree/main).
+
+ * 🛸 A Latte [training script](train.py) using PyTorch DDP.
+ -->
+
+ <video controls loop src="https://github.com/Vchitect/Latte/assets/7929326/a650cd84-2378-4303-822b-56a441e1733b" type="video/mp4"></video>
+
+ ## News
+ - (🔥 New) **Jul 11, 2024** 💥 **Latte-1 is now integrated into [diffusers](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformers/latte_transformer_3d.py). Thanks to [@yiyixuxu](https://github.com/yiyixuxu), [@sayakpaul](https://github.com/sayakpaul), [@a-r-r-o-w](https://github.com/a-r-r-o-w) and [@DN6](https://github.com/DN6).** You can easily run Latte using the following code. We also support inference with 4/8-bit quantization, which can reduce GPU memory from 17 GB to 9 GB. Please refer to this [tutorial](docs/latte_diffusers.md) for more information.
+
+ ```python
+ from diffusers import LattePipeline
+ from diffusers.models import AutoencoderKLTemporalDecoder
+ from torchvision.utils import save_image
+ import torch
+ import imageio
+
+ torch.manual_seed(0)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ video_length = 16 # 1 (text-to-image) or 16 (text-to-video)
+ pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16).to(device)
+
+ # Using temporal decoder of VAE
+ vae = AutoencoderKLTemporalDecoder.from_pretrained("maxin-cn/Latte-1", subfolder="vae_temporal_decoder", torch_dtype=torch.float16).to(device)
+ pipe.vae = vae
+
+ prompt = "a cat wearing sunglasses and working as a lifeguard at pool."
+ videos = pipe(prompt, video_length=video_length, output_type='pt').frames.cpu()
+ ```
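The snippet above imports `save_image` and `imageio` but stops at the denoised frames. A minimal, hedged sketch of writing those frames to disk follows; it assumes `videos` comes back as a `(batch, frames, channels, height, width)` tensor in `[0, 1]` (which is what `output_type='pt'` typically yields) and that the `imageio-ffmpeg` backend is installed for MP4 output:

```python
# Illustrative sketch, not part of this commit: persist the sampled frames.
# Assumes `videos` has shape (batch, frames, channels, height, width) in [0, 1].
import imageio
import torch
from torchvision.utils import save_image

clip = (videos[0].clamp(0, 1) * 255).to(torch.uint8)    # F, C, H, W
clip = clip.permute(0, 2, 3, 1).numpy()                 # F, H, W, C for imageio

if video_length == 1:
    save_image(videos[0], "./latte_sample.png")          # text-to-image case
else:
    imageio.mimwrite("./latte_sample.mp4", clip, fps=8)  # text-to-video case
```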
+
+ - (🔥 New) **May 23, 2024** 💥 **Latte-1** is released! The pre-trained model can be downloaded [here](https://huggingface.co/maxin-cn/Latte-1/tree/main/transformer). **We support both T2V and T2I**. Please run `bash sample/t2v.sh` and `bash sample/t2i.sh` respectively.
+
+ <!--
+ <div align="center">
+ <img src="visuals/latteT2V.gif" width=88%>
+ </div>
+ -->
+
+ - (🔥 New) **Feb 24, 2024** 💥 We are very grateful that researchers and developers are interested in our work. We will continue to update our LatteT2V model, and we hope our efforts can help the community grow. Our Latte Discord channel <a href="https://discord.gg/RguYqhVU92" style="text-decoration:none;">
+ <img src="https://user-images.githubusercontent.com/25839884/218347213-c080267f-cbb6-443e-8532-8e1ed9a58ea9.png" width="3%" alt="" /></a> has been created for discussions; contributions are welcome.
+
+ - (🔥 New) **Jan 9, 2024** 💥 An updated LatteT2V model initialized with [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha) is released; the checkpoint can be found [here](https://huggingface.co/maxin-cn/Latte-0/tree/main/transformer).
+
+ - (🔥 New) **Oct 31, 2023** 💥 The training and inference code is released. All checkpoints (including FaceForensics, SkyTimelapse, UCF101, and Taichi-HD) can be found [here](https://huggingface.co/maxin-cn/Latte/tree/main). In addition, the LatteT2V inference code is provided.
+
+
+ ## Setup
+
+ First, download and set up the repo:
+
+ ```bash
+ git clone https://github.com/Vchitect/Latte
+ cd Latte
+ ```
+
+ We provide an [`environment.yml`](environment.yml) file that can be used to create a Conda environment. If you only want
+ to run pre-trained models locally on CPU, you can remove the `cudatoolkit` and `pytorch-cuda` requirements from the file.
+
+ ```bash
+ conda env create -f environment.yml
+ conda activate latte
+ ```
+
+
+ ## Sampling
+
+ You can sample from our **pre-trained Latte models** with [`sample.py`](sample/sample.py). Weights for our pre-trained Latte models can be found [here](https://huggingface.co/maxin-cn/Latte). The script has various arguments to adjust the number of sampling steps, change the classifier-free guidance scale, etc. For example, to sample from our model on FaceForensics, you can use:
+
+ ```bash
+ bash sample/ffs.sh
+ ```
+
+ or, if you want to sample hundreds of videos, you can use the following script with PyTorch DDP:
+
+ ```bash
+ bash sample/ffs_ddp.sh
+ ```
+
+ If you want to try generating videos from text, just run `bash sample/t2v.sh`. All related checkpoints will be downloaded automatically.
+
+ If you would like to measure the quantitative metrics of your generated results, please refer to [here](docs/datasets_evaluation.md).
+
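The classifier-free guidance scale mentioned above surfaces as `cfg_scale` (or `guidance_scale`) in the sampling configs added in this commit. A minimal sketch of how such a scale is conventionally applied to the conditional and unconditional noise predictions; this is the standard formulation, shown for illustration rather than as the repo's exact sampler code:

```python
import torch

def apply_cfg(eps_uncond: torch.Tensor, eps_cond: torch.Tensor, cfg_scale: float) -> torch.Tensor:
    """Standard classifier-free guidance: push the prediction away from the
    unconditional branch by cfg_scale. cfg_scale == 1.0 disables guidance."""
    return eps_uncond + cfg_scale * (eps_cond - eps_uncond)
```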
+ ## Training
+
+ We provide a training script for Latte in [`train.py`](train.py). The structure of the datasets can be found [here](docs/datasets_evaluation.md). This script can be used to train class-conditional and unconditional
+ Latte models. To launch Latte (256x256) training with `N` GPUs on the FaceForensics dataset, run:
+
+ ```bash
+ torchrun --nnodes=1 --nproc_per_node=N train.py --config ./configs/ffs/ffs_train.yaml
+ ```
+
+ Or, if you have a cluster that uses Slurm, you can also train Latte using the following script:
+
+ ```bash
+ sbatch slurm_scripts/ffs.slurm
+ ```
+
+ We also provide a video-image joint training script, [`train_with_img.py`](train_with_img.py). Similar to [`train.py`](train.py), it can also be used to train class-conditional and unconditional
+ Latte models. For example, to train the Latte model on the FaceForensics dataset, you can use:
+
+ ```bash
+ torchrun --nnodes=1 --nproc_per_node=N train_with_img.py --config ./configs/ffs/ffs_img_train.yaml
+ ```
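Both training entry points are driven by the YAML files under `configs/` that this commit adds. A hedged sketch of turning such a file into an attribute-style config is shown below; OmegaConf is an assumption here (the repo's scripts may use a different loader), and the fields read are the ones visible in `configs/ffs/ffs_train.yaml`:

```python
from omegaconf import OmegaConf

# Illustrative only: load a training config and read a few of its fields.
args = OmegaConf.load("./configs/ffs/ffs_train.yaml")
print(args.model, args.num_frames, args.image_size)      # e.g. Latte-XL/2, 16, 256
print(args.learning_rate, args.local_batch_size)          # training hyperparameters
```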
+
+ ## Contact Us
+ **Yaohui Wang**: [wangyaohui@pjlab.org.cn](mailto:wangyaohui@pjlab.org.cn)
+ **Xin Ma**: [xin.ma1@monash.edu](mailto:xin.ma1@monash.edu)
+
+ ## Citation
+ If you find this work useful for your research, please consider citing it.
+ ```bibtex
+ @article{ma2024latte,
+   title={Latte: Latent Diffusion Transformer for Video Generation},
+   author={Ma, Xin and Wang, Yaohui and Jia, Gengyun and Chen, Xinyuan and Liu, Ziwei and Li, Yuan-Fang and Chen, Cunjian and Qiao, Yu},
+   journal={arXiv preprint arXiv:2401.03048},
+   year={2024}
+ }
+ ```
+
+
+ ## Acknowledgments
+ Latte has been greatly inspired by the following amazing works and teams: [DiT](https://github.com/facebookresearch/DiT) and [PixArt-α](https://github.com/PixArt-alpha/PixArt-alpha); we thank all the contributors for open-sourcing their work.
+
+
+ ## License
+ The code and model weights are licensed under the terms in [LICENSE](LICENSE).
configs/ffs/ffs_img_train.yaml ADDED
@@ -0,0 +1,45 @@
+ # dataset
+ dataset: "ffs_img"
+
+ data_path: "/path/to/datasets/preprocessed_ffs/train/videos/"
+ frame_data_path: "/path/to/datasets/preprocessed_ffs/train/images/"
+ frame_data_txt: "/path/to/datasets/preprocessed_ffs/train_list.txt"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results_img"
+ pretrained:
+
+ # model config:
+ model: LatteIMG-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True # important
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ use_image_num: 8
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 500000
+ local_batch_size: 4 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 100
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/ffs/ffs_sample.yaml ADDED
@@ -0,0 +1,30 @@
+ # path:
+ ckpt: # will be overwritten
+ save_img_path: "./sample_videos" # will be overwritten
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ frame_interval: 2
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+ num_classes:
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed:
+ sample_method: 'ddpm'
+ num_sampling_steps: 250
+ cfg_scale: 1.0
+ negative_name:
+
+ # ddp sample config
+ per_proc_batch_size: 2
+ num_fvd_samples: 2048
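For the DDP sampling fields at the bottom of this config, total throughput per step is `world_size × per_proc_batch_size` videos. A small hedged sketch of the bookkeeping a DDP sampling run typically does to collect `num_fvd_samples`; the `world_size` value is an assumption about how many GPUs `sample/ffs_ddp.sh` launches:

```python
import math

world_size = 8             # number of GPUs launched by the DDP script (assumption)
per_proc_batch_size = 2    # from this config
num_fvd_samples = 2048     # from this config

global_batch = world_size * per_proc_batch_size
iterations = math.ceil(num_fvd_samples / global_batch)
print(iterations)          # 128 sampling iterations to reach 2048 videos
```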
configs/ffs/ffs_train.yaml ADDED
@@ -0,0 +1,42 @@
+ # dataset
+ dataset: "ffs"
+
+ data_path: "/path/to/datasets/preprocess_ffs/train/videos/" # s
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results"
+ pretrained:
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True # important
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 20000
+ local_batch_size: 5 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 100
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/sky/sky_img_train.yaml ADDED
@@ -0,0 +1,43 @@
+ # dataset
+ dataset: "sky_img"
+
+ data_path: "/path/to/datasets/sky_timelapse/sky_train/" # s/p
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results_img"
+ pretrained:
+
+ # model config:
+ model: LatteIMG-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ use_image_num: 8 # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 20000
+ local_batch_size: 4 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/sky/sky_sample.yaml ADDED
@@ -0,0 +1,32 @@
+ # path:
+ ckpt: # will be overwritten
+ save_img_path: "./sample_videos/" # will be overwritten
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ frame_interval: 2
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+ num_classes:
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed:
+ sample_method: 'ddpm'
+ num_sampling_steps: 250
+ cfg_scale: 1.0
+ run_time: 12
+ num_sample: 1
+ negative_name:
+
+ # ddp sample config
+ per_proc_batch_size: 1
+ num_fvd_samples: 2
configs/sky/sky_train.yaml ADDED
@@ -0,0 +1,42 @@
+ # dataset
+ dataset: "sky"
+
+ data_path: "/path/to/datasets/sky_timelapse/sky_train/"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results"
+ pretrained:
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 20000
+ local_batch_size: 5 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/t2x/t2i_sample.yaml ADDED
@@ -0,0 +1,37 @@
+ # path:
+ save_img_path: "./sample_videos/t2i-"
+ pretrained_model_path: "maxin-cn/Latte-1"
+
+ # model config:
+ # maxin-cn/Latte-0: the first released version
+ # maxin-cn/Latte-1: the second version with better performance (released on May 23, 2024)
+ model: LatteT2V
+ video_length: 1
+ image_size: [512, 512]
+ # # beta schedule
+ beta_start: 0.0001
+ beta_end: 0.02
+ beta_schedule: "linear"
+ variance_type: "learned_range"
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed:
+ run_time: 0
+ guidance_scale: 7.5
+ sample_method: 'DDIM'
+ num_sampling_steps: 50
+ enable_temporal_attentions: True # LatteT2V-V0: set to False; LatteT2V-V1: set to True
+ enable_vae_temporal_decoder: False
+
+ text_prompt: [
+   'Yellow and black tropical fish dart through the sea.',
+   'An epic tornado attacking above a glowing city at night.',
+   'Slow pan upward of blazing oak fire in an indoor fireplace.',
+   'a cat wearing sunglasses and working as a lifeguard at pool.',
+   'Sunset over the sea.',
+   'A dog in astronaut suit and sunglasses floating in space.',
+ ]
configs/t2x/t2v_sample.yaml ADDED
@@ -0,0 +1,37 @@
+ # path:
+ save_img_path: "./sample_videos/t2v-"
+ pretrained_model_path: "/data/monash_vidgen/pretrained/Latte-1"
+
+ # model config:
+ # maxin-cn/Latte-0: the first released version
+ # maxin-cn/Latte-1: the second version with better performance (released on May 23, 2024)
+ model: LatteT2V
+ video_length: 16
+ image_size: [512, 512]
+ # # beta schedule
+ beta_start: 0.0001
+ beta_end: 0.02
+ beta_schedule: "linear"
+ variance_type: "learned_range"
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed: 0
+ run_time: 0
+ guidance_scale: 7.5
+ sample_method: 'DDIM'
+ num_sampling_steps: 50
+ enable_temporal_attentions: True
+ enable_vae_temporal_decoder: True # use the temporal VAE decoder from SVD; may reduce video flicker (not widely tested)
+
+ text_prompt: [
+   'Yellow and black tropical fish dart through the sea.',
+   'An epic tornado attacking above a glowing city at night.',
+   'Slow pan upward of blazing oak fire in an indoor fireplace.',
+   'a cat wearing sunglasses and working as a lifeguard at pool.',
+   'Sunset over the sea.',
+   'A dog in astronaut suit and sunglasses floating in space.',
+ ]
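The `beta_start`/`beta_end`/`beta_schedule`/`variance_type` fields above describe a standard linear DDPM noise schedule. A minimal sketch of the quantities they imply, for illustration rather than as the repo's diffusion code (the 1000-step horizon is an assumption):

```python
import torch

beta_start, beta_end, num_train_timesteps = 0.0001, 0.02, 1000  # step count is an assumption

betas = torch.linspace(beta_start, beta_end, num_train_timesteps)  # "linear" schedule
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

# Forward process at step t: x_t = sqrt(alphas_cumprod[t]) * x_0 + sqrt(1 - alphas_cumprod[t]) * noise
# With variance_type "learned_range", the model predicts an interpolation between beta_t and the
# posterior variance rather than using a fixed value.
```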
configs/taichi/taichi_img_train.yaml ADDED
@@ -0,0 +1,43 @@
+ # dataset
+ dataset: "taichi_img"
+
+ data_path: "/path/to/datasets/taichi"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results_img"
+ pretrained:
+
+ # model config:
+ model: LatteIMG-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ load_from_ceph: False # important
+ use_image_num: 8
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 500000
+ local_batch_size: 4 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training TODO
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/taichi/taichi_sample.yaml ADDED
@@ -0,0 +1,30 @@
+ # path:
+ ckpt: # will be overwritten
+ save_img_path: "./sample_videos/" # will be overwritten
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ frame_interval: 2
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+ num_classes:
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed:
+ sample_method: 'ddpm'
+ num_sampling_steps: 250
+ cfg_scale: 1.0
+ negative_name:
+
+ # ddp sample config
+ per_proc_batch_size: 1
+ num_fvd_samples: 2
configs/taichi/taichi_train.yaml ADDED
@@ -0,0 +1,42 @@
+ # dataset
+ dataset: "taichi"
+
+ data_path: "/path/to/datasets/taichi"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results"
+ pretrained:
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 1 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ load_from_ceph: False # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 500000
+ local_batch_size: 5 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes:
+
+ # low VRAM and speed up training TODO
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/ucf101/ucf101_img_train.yaml ADDED
@@ -0,0 +1,44 @@
+ # dataset
+ dataset: "ucf101_img"
+
+ data_path: "/path/to/datasets/UCF101/videos/"
+ frame_data_txt: "/path/to/datasets/UCF101/train_256_list.txt"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results_img"
+ pretrained:
+
+ # model config:
+ model: LatteIMG-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 2 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ use_image_num: 8 # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 100000
+ local_batch_size: 4 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes: 101
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
configs/ucf101/ucf101_sample.yaml ADDED
@@ -0,0 +1,33 @@
+ # path:
+ ckpt:
+ save_img_path: "./sample_videos/"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 2 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+ num_classes: 101
+
+ # model speedup
+ use_compile: False
+ use_fp16: True
+
+ # sample config:
+ seed:
+ sample_method: 'ddpm'
+ num_sampling_steps: 250
+ cfg_scale: 7.0
+ run_time: 12
+ num_sample: 1
+ sample_names: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+ negative_name: 101
+
+ # ddp sample config
+ per_proc_batch_size: 2
+ num_fvd_samples: 2
configs/ucf101/ucf101_train.yaml ADDED
@@ -0,0 +1,42 @@
+ # dataset
+ dataset: "ucf101"
+
+ data_path: "/path/to/datasets/UCF101/videos/"
+ pretrained_model_path: "/path/to/pretrained/Latte/"
+
+ # save and load
+ results_dir: "./results"
+ pretrained:
+
+ # model config:
+ model: Latte-XL/2
+ num_frames: 16
+ image_size: 256 # choices=[256, 512]
+ num_sampling_steps: 250
+ frame_interval: 3
+ fixed_spatial: False
+ attention_bias: True
+ learn_sigma: True
+ extras: 2 # [1, 2] 1 unconditional generation, 2 class-conditional generation
+
+ # train config:
+ save_ceph: True # important
+ learning_rate: 1e-4
+ ckpt_every: 10000
+ clip_max_norm: 0.1
+ start_clip_iter: 100000
+ local_batch_size: 5 # important
+ max_train_steps: 1000000
+ global_seed: 3407
+ num_workers: 8
+ log_every: 50
+ lr_warmup_steps: 0
+ resume_from_checkpoint:
+ gradient_accumulation_steps: 1 # TODO
+ num_classes: 101
+
+ # low VRAM and speed up training
+ use_compile: False
+ mixed_precision: False
+ enable_xformers_memory_efficient_attention: False
+ gradient_checkpointing: False
datasets/__init__.py ADDED
@@ -0,0 +1,79 @@
+ from .sky_datasets import Sky
+ from torchvision import transforms
+ from .taichi_datasets import Taichi
+ from datasets import video_transforms
+ from .ucf101_datasets import UCF101
+ from .ffs_datasets import FaceForensics
+ from .ffs_image_datasets import FaceForensicsImages
+ from .sky_image_datasets import SkyImages
+ from .ucf101_image_datasets import UCF101Images
+ from .taichi_image_datasets import TaichiImages
+
+
+ def get_dataset(args):
+     temporal_sample = video_transforms.TemporalRandomCrop(args.num_frames * args.frame_interval) # 16 1
+
+     if args.dataset == 'ffs':
+         transform_ffs = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             video_transforms.UCFCenterCropVideo(args.image_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return FaceForensics(args, transform=transform_ffs, temporal_sample=temporal_sample)
+     elif args.dataset == 'ffs_img':
+         transform_ffs = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             video_transforms.UCFCenterCropVideo(args.image_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return FaceForensicsImages(args, transform=transform_ffs, temporal_sample=temporal_sample)
+     elif args.dataset == 'ucf101':
+         transform_ucf101 = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             video_transforms.UCFCenterCropVideo(args.image_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return UCF101(args, transform=transform_ucf101, temporal_sample=temporal_sample)
+     elif args.dataset == 'ucf101_img':
+         transform_ucf101 = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             video_transforms.UCFCenterCropVideo(args.image_size),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return UCF101Images(args, transform=transform_ucf101, temporal_sample=temporal_sample)
+     elif args.dataset == 'taichi':
+         transform_taichi = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return Taichi(args, transform=transform_taichi, temporal_sample=temporal_sample)
+     elif args.dataset == 'taichi_img':
+         transform_taichi = transforms.Compose([
+             video_transforms.ToTensorVideo(), # TCHW
+             video_transforms.RandomHorizontalFlipVideo(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return TaichiImages(args, transform=transform_taichi, temporal_sample=temporal_sample)
+     elif args.dataset == 'sky':
+         transform_sky = transforms.Compose([
+             video_transforms.ToTensorVideo(),
+             video_transforms.CenterCropResizeVideo(args.image_size),
+             # video_transforms.RandomHorizontalFlipVideo(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return Sky(args, transform=transform_sky, temporal_sample=temporal_sample)
+     elif args.dataset == 'sky_img':
+         transform_sky = transforms.Compose([
+             video_transforms.ToTensorVideo(),
+             video_transforms.CenterCropResizeVideo(args.image_size),
+             # video_transforms.RandomHorizontalFlipVideo(),
+             transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
+         ])
+         return SkyImages(args, transform=transform_sky, temporal_sample=temporal_sample)
+     else:
+         raise NotImplementedError(args.dataset)
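`get_dataset` only reads a handful of attributes from `args` (`dataset`, `num_frames`, `frame_interval`, `image_size`, `data_path` via the dataset constructors). A hedged usage sketch with a plain namespace and a DataLoader is shown below; the field values are placeholders, and the individual dataset classes may read further config fields not listed here:

```python
import argparse
import torch
from datasets import get_dataset   # the package defined by this __init__.py

args = argparse.Namespace(
    dataset='sky',                 # one of the branches handled above
    num_frames=16,
    frame_interval=3,
    image_size=256,
    data_path='/path/to/datasets/sky_timelapse/sky_train/',
)

dataset = get_dataset(args)
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, num_workers=4)
batch = next(iter(loader))
# If Sky returns clips like FaceForensics does, batch['video'] should be roughly [2, 16, 3, 256, 256].
print(batch['video'].shape)
```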
datasets/ffs_datasets.py ADDED
@@ -0,0 +1,164 @@
+ import os
+ import json
+ import torch
+ import decord
+ import torchvision
+
+ import numpy as np
+
+
+ from PIL import Image
+ from einops import rearrange
+ from typing import Dict, List, Tuple
+
+ class_labels_map = None
+ cls_sample_cnt = None
+
+ def temporal_sampling(frames, start_idx, end_idx, num_samples):
+     """
+     Given the start and end frame index, sample num_samples frames between
+     the start and end with equal interval.
+     Args:
+         frames (tensor): a tensor of video frames, dimension is
+             `num video frames` x `channel` x `height` x `width`.
+         start_idx (int): the index of the start frame.
+         end_idx (int): the index of the end frame.
+         num_samples (int): number of frames to sample.
+     Returns:
+         frames (tensor): a tensor of temporally sampled video frames, dimension is
+             `num clip frames` x `channel` x `height` x `width`.
+     """
+     index = torch.linspace(start_idx, end_idx, num_samples)
+     index = torch.clamp(index, 0, frames.shape[0] - 1).long()
+     frames = torch.index_select(frames, 0, index)
+     return frames
+
+
+ def numpy2tensor(x):
+     return torch.from_numpy(x)
+
+
+ def get_filelist(file_path):
+     Filelist = []
+     for home, dirs, files in os.walk(file_path):
+         for filename in files:
+             Filelist.append(os.path.join(home, filename))
+             # Filelist.append(filename)
+     return Filelist
+
+
+ def load_annotation_data(data_file_path):
+     with open(data_file_path, 'r') as data_file:
+         return json.load(data_file)
+
+
+ def get_class_labels(num_class, anno_pth='./k400_classmap.json'):
+     global class_labels_map, cls_sample_cnt
+
+     if class_labels_map is not None:
+         return class_labels_map, cls_sample_cnt
+     else:
+         cls_sample_cnt = {}
+         class_labels_map = load_annotation_data(anno_pth)
+         for cls in class_labels_map:
+             cls_sample_cnt[cls] = 0
+         return class_labels_map, cls_sample_cnt
+
+
+ def load_annotations(ann_file, num_class, num_samples_per_cls):
+     dataset = []
+     class_to_idx, cls_sample_cnt = get_class_labels(num_class)
+     with open(ann_file, 'r') as fin:
+         for line in fin:
+             line_split = line.strip().split('\t')
+             sample = {}
+             idx = 0
+             # idx for frame_dir
+             frame_dir = line_split[idx]
+             sample['video'] = frame_dir
+             idx += 1
+
+             # idx for label[s]
+             label = [x for x in line_split[idx:]]
+             assert label, f'missing label in line: {line}'
+             assert len(label) == 1
+             class_name = label[0]
+             class_index = int(class_to_idx[class_name])
+
+             # choose a class subset of the whole dataset
+             if class_index < num_class:
+                 sample['label'] = class_index
+                 if cls_sample_cnt[class_name] < num_samples_per_cls:
+                     dataset.append(sample)
+                     cls_sample_cnt[class_name] += 1
+
+     return dataset
+
+
+ class DecordInit(object):
+     """Using Decord (https://github.com/dmlc/decord) to initialize the video_reader."""
+
+     def __init__(self, num_threads=1, **kwargs):
+         self.num_threads = num_threads
+         self.ctx = decord.cpu(0)
+         self.kwargs = kwargs
+
+     def __call__(self, filename):
+         """Perform the Decord initialization.
+         Args:
+             results (dict): The resulting dict to be modified and passed
+                 to the next transform in the pipeline.
+         """
+         reader = decord.VideoReader(filename,
+                                     ctx=self.ctx,
+                                     num_threads=self.num_threads)
+         return reader
+
+     def __repr__(self):
+         repr_str = (f'{self.__class__.__name__}('
+                     f'sr={self.sr},'
+                     f'num_threads={self.num_threads})')
+         return repr_str
+
+
+ class FaceForensics(torch.utils.data.Dataset):
+     """Load the FaceForensics video files
+
+     Args:
+         target_video_len (int): the number of video frames to be loaded.
+         align_transform (callable): Align different videos to a specified size.
+         temporal_sample (callable): Sample the target length of a video.
+     """
+
+     def __init__(self,
+                  configs,
+                  transform=None,
+                  temporal_sample=None):
+         self.configs = configs
+         self.data_path = configs.data_path
+         self.video_lists = get_filelist(configs.data_path)
+         self.transform = transform
+         self.temporal_sample = temporal_sample
+         self.target_video_len = self.configs.num_frames
+         self.v_decoder = DecordInit()
+
+     def __getitem__(self, index):
+         path = self.video_lists[index]
+         vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
+         total_frames = len(vframes)
+
+         # Sampling video frames
+         start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
+         assert end_frame_ind - start_frame_ind >= self.target_video_len
+         frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, self.target_video_len, dtype=int)
+         video = vframes[frame_indice]
+         # video transformer data preprocessing
+         video = self.transform(video)  # T C H W
+         return {'video': video, 'video_name': 1}
+
+     def __len__(self):
+         return len(self.video_lists)
+
+
+ if __name__ == '__main__':
+     pass
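The `__getitem__` above first draws a window with `TemporalRandomCrop(num_frames * frame_interval)` and then spreads `num_frames` indices evenly across it with `np.linspace`. A small numeric sketch of that index selection; the window start is an arbitrary illustrative value:

```python
import numpy as np

num_frames, frame_interval = 16, 3
start_frame_ind = 40                                        # illustrative window start
end_frame_ind = start_frame_ind + num_frames * frame_interval

frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, num_frames, dtype=int)
print(frame_indice)   # 16 indices from 40 to 87, roughly frame_interval frames apart
```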
datasets/ffs_image_datasets.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import torch
4
+ import decord
5
+ import torchvision
6
+
7
+ import numpy as np
8
+
9
+ import random
10
+ from PIL import Image
11
+ from einops import rearrange
12
+ from typing import Dict, List, Tuple
13
+ from torchvision import transforms
14
+ import traceback
15
+
16
+ class_labels_map = None
17
+ cls_sample_cnt = None
18
+
19
+ def temporal_sampling(frames, start_idx, end_idx, num_samples):
20
+ """
21
+ Given the start and end frame index, sample num_samples frames between
22
+ the start and end with equal interval.
23
+ Args:
24
+ frames (tensor): a tensor of video frames, dimension is
25
+ `num video frames` x `channel` x `height` x `width`.
26
+ start_idx (int): the index of the start frame.
27
+ end_idx (int): the index of the end frame.
28
+ num_samples (int): number of frames to sample.
29
+ Returns:
30
+ frames (tersor): a tensor of temporal sampled video frames, dimension is
31
+ `num clip frames` x `channel` x `height` x `width`.
32
+ """
33
+ index = torch.linspace(start_idx, end_idx, num_samples)
34
+ index = torch.clamp(index, 0, frames.shape[0] - 1).long()
35
+ frames = torch.index_select(frames, 0, index)
36
+ return frames
37
+
38
+
39
+ def numpy2tensor(x):
40
+ return torch.from_numpy(x)
41
+
42
+
43
+ def get_filelist(file_path):
44
+ Filelist = []
45
+ for home, dirs, files in os.walk(file_path):
46
+ for filename in files:
47
+ # 文件名列表,包含完整路径
48
+ Filelist.append(os.path.join(home, filename))
49
+ # # 文件名列表,只包含文件名
50
+ # Filelist.append( filename)
51
+ return Filelist
52
+
53
+
54
+ def load_annotation_data(data_file_path):
55
+ with open(data_file_path, 'r') as data_file:
56
+ return json.load(data_file)
57
+
58
+
59
+ def get_class_labels(num_class, anno_pth='./k400_classmap.json'):
60
+ global class_labels_map, cls_sample_cnt
61
+
62
+ if class_labels_map is not None:
63
+ return class_labels_map, cls_sample_cnt
64
+ else:
65
+ cls_sample_cnt = {}
66
+ class_labels_map = load_annotation_data(anno_pth)
67
+ for cls in class_labels_map:
68
+ cls_sample_cnt[cls] = 0
69
+ return class_labels_map, cls_sample_cnt
70
+
71
+
72
+ def load_annotations(ann_file, num_class, num_samples_per_cls):
73
+ dataset = []
74
+ class_to_idx, cls_sample_cnt = get_class_labels(num_class)
75
+ with open(ann_file, 'r') as fin:
76
+ for line in fin:
77
+ line_split = line.strip().split('\t')
78
+ sample = {}
79
+ idx = 0
80
+ # idx for frame_dir
81
+ frame_dir = line_split[idx]
82
+ sample['video'] = frame_dir
83
+ idx += 1
84
+
85
+ # idx for label[s]
86
+ label = [x for x in line_split[idx:]]
87
+ assert label, f'missing label in line: {line}'
88
+ assert len(label) == 1
89
+ class_name = label[0]
90
+ class_index = int(class_to_idx[class_name])
91
+
92
+ # choose a class subset of whole dataset
93
+ if class_index < num_class:
94
+ sample['label'] = class_index
95
+ if cls_sample_cnt[class_name] < num_samples_per_cls:
96
+ dataset.append(sample)
97
+ cls_sample_cnt[class_name]+=1
98
+
99
+ return dataset
100
+
101
+
102
+ class DecordInit(object):
103
+ """Using Decord(https://github.com/dmlc/decord) to initialize the video_reader."""
104
+
105
+ def __init__(self, num_threads=1, **kwargs):
106
+ self.num_threads = num_threads
107
+ self.ctx = decord.cpu(0)
108
+ self.kwargs = kwargs
109
+
110
+ def __call__(self, filename):
111
+ """Perform the Decord initialization.
112
+ Args:
113
+ results (dict): The resulting dict to be modified and passed
114
+ to the next transform in pipeline.
115
+ """
116
+ reader = decord.VideoReader(filename,
117
+ ctx=self.ctx,
118
+ num_threads=self.num_threads)
119
+ return reader
120
+
121
+ def __repr__(self):
122
+ repr_str = (f'{self.__class__.__name__}('
123
+ f'sr={self.sr},'
124
+ f'num_threads={self.num_threads})')
125
+ return repr_str
126
+
127
+
128
+ class FaceForensicsImages(torch.utils.data.Dataset):
129
+ """Load the FaceForensics video files
130
+
131
+ Args:
132
+ target_video_len (int): the number of video frames will be load.
133
+ align_transform (callable): Align different videos in a specified size.
134
+ temporal_sample (callable): Sample the target length of a video.
135
+ """
136
+
137
+ def __init__(self,
138
+ configs,
139
+ transform=None,
140
+ temporal_sample=None):
141
+ self.configs = configs
142
+ self.data_path = configs.data_path
143
+ self.video_lists = get_filelist(configs.data_path)
144
+ self.transform = transform
145
+ self.temporal_sample = temporal_sample
146
+ self.target_video_len = self.configs.num_frames
147
+ self.v_decoder = DecordInit()
148
+ self.video_length = len(self.video_lists)
149
+
150
+ # ffs video frames
151
+ self.video_frame_path = configs.frame_data_path
152
+ self.video_frame_txt = configs.frame_data_txt
153
+ self.video_frame_files = [frame_file.strip() for frame_file in open(self.video_frame_txt)]
154
+ random.shuffle(self.video_frame_files)
155
+ self.use_image_num = configs.use_image_num
156
+ self.image_tranform = transforms.Compose([
157
+ transforms.ToTensor(),
158
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
159
+ ])
160
+
161
+ def __getitem__(self, index):
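+ # __len__ counts individual frame images, so the video index wraps around the smaller
+ # video list; each sample packs num_frames video frames plus use_image_num still images
+ # along the first dimension for joint image-video training.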
162
+ video_index = index % self.video_length
163
+ path = self.video_lists[video_index]
164
+ vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
165
+ total_frames = len(vframes)
166
+
167
+ # Sampling video frames
168
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
169
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
170
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, self.target_video_len, dtype=int)
171
+ video = vframes[frame_indice]
172
+ # video transformer data preprocessing
173
+ video = self.transform(video) # T C H W
174
+
175
+ # get video frames
176
+ images = []
177
+ for i in range(self.use_image_num):
178
+ while True:
179
+ try:
180
+ image = Image.open(os.path.join(self.video_frame_path, self.video_frame_files[index+i])).convert("RGB")
181
+ image = self.image_tranform(image).unsqueeze(0)
182
+ images.append(image)
183
+ break
184
+ except Exception as e:
185
+ traceback.print_exc()
186
+ index = random.randint(0, len(self.video_frame_files) - self.use_image_num)
187
+ images = torch.cat(images, dim=0)
188
+
189
+ assert len(images) == self.use_image_num
190
+
191
+ video_cat = torch.cat([video, images], dim=0)
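+ # video_cat has shape (num_frames + use_image_num, C, H, W)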
192
+
193
+ return {'video': video_cat, 'video_name': 1}
194
+
195
+ def __len__(self):
196
+ return len(self.video_frame_files)
197
+
198
+
199
+ if __name__ == '__main__':
200
+ import argparse
201
+ import torchvision
202
+ import video_transforms
203
+
204
+ import torch.utils.data as Data
205
+ import torchvision.transforms as transform
206
+
207
+ from PIL import Image
208
+
209
+
210
+ parser = argparse.ArgumentParser()
211
+ parser.add_argument("--num_frames", type=int, default=16)
212
+ parser.add_argument("--use-image-num", type=int, default=5)
213
+ parser.add_argument("--frame_interval", type=int, default=3)
214
+ parser.add_argument("--dataset", type=str, default='webvideo10m')
215
+ parser.add_argument("--test-run", type=bool, default='')
216
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/preprocessed_ffs/train/videos/")
217
+ parser.add_argument("--frame-data-path", type=str, default="/path/to/datasets/preprocessed_ffs/train/images/")
218
+ parser.add_argument("--frame-data-txt", type=str, default="/path/to/datasets/faceForensics_v1/train_list.txt")
219
+ config = parser.parse_args()
220
+
221
+ temporal_sample = video_transforms.TemporalRandomCrop(config.num_frames * config.frame_interval)
222
+
223
+ transform_webvideo = transform.Compose([
224
+ video_transforms.ToTensorVideo(),
225
+ transform.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
226
+ ])
227
+
228
+ dataset = FaceForensicsImages(config, transform=transform_webvideo, temporal_sample=temporal_sample)
229
+ dataloader = Data.DataLoader(dataset=dataset, batch_size=1, shuffle=True, num_workers=4)
230
+
231
+ for i, video_data in enumerate(dataloader):
232
+ video, video_label = video_data['video'], video_data['video_name']
233
+ # print(video_label)
234
+ # print(image_label)
235
+ print(video.shape)
236
+ print(video_label)
237
+ # video_ = ((video[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)
238
+ # print(video_.shape)
239
+ # try:
240
+ # torchvision.io.write_video(f'./test/{i:03d}_{video_label}.mp4', video_[:16], fps=8)
241
+ # except:
242
+ # pass
243
+
244
+ # if i % 100 == 0 and i != 0:
245
+ # break
246
+ print('Done!')
datasets/sky_datasets.py ADDED
@@ -0,0 +1,110 @@
1
+ import os
2
+ import torch
3
+ import random
4
+ import torch.utils.data as data
5
+
6
+ import numpy as np
7
+
8
+ from PIL import Image
9
+
10
+ IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
11
+
12
+ def is_image_file(filename):
13
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
14
+
15
+ class Sky(data.Dataset):
16
+ def __init__(self, configs, transform, temporal_sample=None, train=True):
17
+
18
+ self.configs = configs
19
+ self.data_path = configs.data_path
20
+ self.transform = transform
21
+ self.temporal_sample = temporal_sample
22
+ self.target_video_len = self.configs.num_frames
23
+ self.frame_interval = self.configs.frame_interval
24
+ self.data_all = self.load_video_frames(self.data_path)
25
+
26
+ def __getitem__(self, index):
27
+
28
+ vframes = self.data_all[index]
29
+ total_frames = len(vframes)
30
+
31
+ # Sampling video frames
32
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
33
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
34
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, num=self.target_video_len, dtype=int) # start, stop, num=50
35
+
36
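+ # Note: only the endpoints of frame_indice are used below; frames are then taken every
+ # frame_interval steps between them, so the clip length follows the sampled window.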
+ select_video_frames = vframes[frame_indice[0]: frame_indice[-1]+1: self.frame_interval]
37
+
38
+ video_frames = []
39
+ for path in select_video_frames:
40
+ video_frame = torch.as_tensor(np.array(Image.open(path), dtype=np.uint8, copy=True)).unsqueeze(0)
41
+ video_frames.append(video_frame)
42
+ video_clip = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2)
43
+ video_clip = self.transform(video_clip)
44
+
45
+ return {'video': video_clip, 'video_name': 1}
46
+
47
+ def __len__(self):
48
+ return self.video_num
49
+
50
+ def load_video_frames(self, dataroot):
51
+ data_all = []
52
+ frame_list = os.walk(dataroot)
53
+ for _, meta in enumerate(frame_list):
54
+ root = meta[0]
55
+ try:
56
+ frames = sorted(meta[2], key=lambda item: int(item.split('.')[0].split('_')[-1]))
57
+ except:
58
+ print(meta[0]) # root
59
+ print(meta[2]) # files
+ continue # skip directories whose frame names cannot be parsed
60
+ frames = [os.path.join(root, item) for item in frames if is_image_file(item)]
61
+ if len(frames) > max(0, self.target_video_len * self.frame_interval): # need all > (16 * frame-interval) videos
62
+ # if len(frames) >= max(0, self.target_video_len): # need all > 16 frames videos
63
+ data_all.append(frames)
64
+ self.video_num = len(data_all)
65
+ return data_all
66
+
67
+
68
+ if __name__ == '__main__':
69
+
70
+ import argparse
71
+ import torchvision
72
+ import video_transforms
73
+ import torch.utils.data as data
74
+
75
+ from torchvision import transforms
76
+ from torchvision.utils import save_image
77
+
78
+
79
+ parser = argparse.ArgumentParser()
80
+ parser.add_argument("--num_frames", type=int, default=16)
81
+ parser.add_argument("--frame_interval", type=int, default=4)
82
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/sky_timelapse/sky_train/")
83
+ config = parser.parse_args()
84
+
85
+
86
+ target_video_len = config.num_frames
87
+
88
+ temporal_sample = video_transforms.TemporalRandomCrop(target_video_len * config.frame_interval)
89
+ trans = transforms.Compose([
90
+ video_transforms.ToTensorVideo(),
91
+ # video_transforms.CenterCropVideo(256),
92
+ video_transforms.CenterCropResizeVideo(256),
93
+ # video_transforms.RandomHorizontalFlipVideo(),
94
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
95
+ ])
96
+
97
+ taichi_dataset = Sky(config, transform=trans, temporal_sample=temporal_sample)
98
+ print(len(taichi_dataset))
99
+ taichi_dataloader = data.DataLoader(dataset=taichi_dataset, batch_size=1, shuffle=False, num_workers=1)
100
+
101
+ for i, video_data in enumerate(taichi_dataloader):
102
+ print(video_data['video'].shape)
103
+
104
+ # print(video_data.dtype)
105
+ # for i in range(target_video_len):
106
+ # save_image(video_data[0][i], os.path.join('./test_data', '%04d.png' % i), normalize=True, value_range=(-1, 1))
107
+
108
+ # video_ = ((video_data[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)
109
+ # torchvision.io.write_video('./test_data' + 'test.mp4', video_, fps=8)
110
+ # exit()
datasets/sky_image_datasets.py ADDED
@@ -0,0 +1,137 @@
1
+ import os
2
+ import torch
3
+ import random
4
+ import torch.utils.data as data
5
+ import numpy as np
6
+ import copy
7
+ from PIL import Image
8
+
9
+ IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
10
+
11
+ def is_image_file(filename):
12
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
13
+
14
+ class SkyImages(data.Dataset):
15
+ def __init__(self, configs, transform, temporal_sample=None, train=True):
16
+
17
+ self.configs = configs
18
+ self.data_path = configs.data_path
19
+ self.transform = transform
20
+ self.temporal_sample = temporal_sample
21
+ self.target_video_len = self.configs.num_frames
22
+ self.frame_interval = self.configs.frame_interval
23
+ self.data_all, self.video_frame_all = self.load_video_frames(self.data_path)
24
+
25
+ # sky video frames
26
+ random.shuffle(self.video_frame_all)
27
+ self.use_image_num = configs.use_image_num
28
+
29
+ def __getitem__(self, index):
30
+
31
+ video_index = index % self.video_num
32
+ vframes = self.data_all[video_index]
33
+ total_frames = len(vframes)
34
+
35
+ # Sampling video frames
36
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
37
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
38
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, num=self.target_video_len, dtype=int) # start, stop, num=50
39
+
40
+ select_video_frames = vframes[frame_indice[0]: frame_indice[-1]+1: self.frame_interval]
41
+
42
+ video_frames = []
43
+ for path in select_video_frames:
44
+ video_frame = torch.as_tensor(np.array(Image.open(path), dtype=np.uint8, copy=True)).unsqueeze(0)
45
+ video_frames.append(video_frame)
46
+ video_clip = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2)
47
+ video_clip = self.transform(video_clip)
48
+
49
+ # get video frames
50
+ images = []
51
+
52
+ for i in range(self.use_image_num):
53
+ while True:
54
+ try:
55
+ video_frame_path = self.video_frame_all[index+i]
56
+ image = torch.as_tensor(np.array(Image.open(video_frame_path), dtype=np.uint8, copy=True)).unsqueeze(0)
57
+ images.append(image)
58
+ break
59
+ except Exception as e:
60
+ index = random.randint(0, self.video_frame_num - self.use_image_num)
61
+
62
+ images = torch.cat(images, dim=0).permute(0, 3, 1, 2)
63
+ images = self.transform(images)
64
+ assert len(images) == self.use_image_num
65
+
66
+ video_cat = torch.cat([video_clip, images], dim=0)
67
+
68
+ return {'video': video_cat, 'video_name': 1}
69
+
70
+ def __len__(self):
71
+ return self.video_frame_num
72
+
73
+ def load_video_frames(self, dataroot):
74
+ data_all = []
75
+ frames_all = []
76
+ frame_list = os.walk(dataroot)
77
+ for _, meta in enumerate(frame_list):
78
+ root = meta[0]
79
+ try:
80
+ frames = sorted(meta[2], key=lambda item: int(item.split('.')[0].split('_')[-1]))
81
+ except:
82
+ print(meta[0]) # root
83
+ print(meta[2]) # files
+ continue # skip directories whose frame names cannot be parsed
84
+ frames = [os.path.join(root, item) for item in frames if is_image_file(item)]
85
+ if len(frames) > max(0, self.target_video_len * self.frame_interval): # need all > (16 * frame-interval) videos
86
+ # if len(frames) >= max(0, self.target_video_len): # need all > 16 frames videos
87
+ data_all.append(frames)
88
+ for frame in frames:
89
+ frames_all.append(frame)
90
+ self.video_num = len(data_all)
91
+ self.video_frame_num = len(frames_all)
92
+ return data_all, frames_all
93
+
94
+
95
+ if __name__ == '__main__':
96
+
97
+ import argparse
98
+ import torchvision
99
+ import video_transforms
100
+ import torch.utils.data as data
101
+
102
+ from torchvision import transforms
103
+ from torchvision.utils import save_image
104
+
105
+
106
+ parser = argparse.ArgumentParser()
107
+ parser.add_argument("--num_frames", type=int, default=16)
108
+ parser.add_argument("--frame_interval", type=int, default=3)
109
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/sky_timelapse/sky_train/")
110
+ parser.add_argument("--use-image-num", type=int, default=5)
111
+ config = parser.parse_args()
112
+
113
+ target_video_len = config.num_frames
114
+
115
+ temporal_sample = video_transforms.TemporalRandomCrop(target_video_len * config.frame_interval)
116
+ trans = transforms.Compose([
117
+ video_transforms.ToTensorVideo(),
118
+ # video_transforms.CenterCropVideo(256),
119
+ video_transforms.CenterCropResizeVideo(256),
120
+ # video_transforms.RandomHorizontalFlipVideo(),
121
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
122
+ ])
123
+
124
+ taichi_dataset = SkyImages(config, transform=trans, temporal_sample=temporal_sample)
125
+ print(len(taichi_dataset))
126
+ taichi_dataloader = data.DataLoader(dataset=taichi_dataset, batch_size=1, shuffle=False, num_workers=1)
127
+
128
+ for i, video_data in enumerate(taichi_dataloader):
129
+ print(video_data['video'].shape)
130
+
131
+ # print(video_data.dtype)
132
+ # for i in range(target_video_len):
133
+ # save_image(video_data[0][i], os.path.join('./test_data', '%04d.png' % i), normalize=True, value_range=(-1, 1))
134
+
135
+ # video_ = ((video_data[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)
136
+ # torchvision.io.write_video('./test_data' + 'test.mp4', video_, fps=8)
137
+ # exit()
datasets/taichi_datasets.py ADDED
@@ -0,0 +1,108 @@
1
+ import os
2
+ import torch
3
+ import random
4
+ import torch.utils.data as data
5
+
6
+ import numpy as np
7
+ import io
8
+ import json
9
+ from PIL import Image
10
+
11
+ IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
12
+
13
+ def is_image_file(filename):
14
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
15
+
16
+ class Taichi(data.Dataset):
17
+ def __init__(self, configs, transform, temporal_sample=None, train=True):
18
+
19
+ self.configs = configs
20
+ self.data_path = configs.data_path
21
+ self.transform = transform
22
+ self.temporal_sample = temporal_sample
23
+ self.target_video_len = self.configs.num_frames
24
+ self.frame_interval = self.configs.frame_interval
25
+ self.data_all = self.load_video_frames(self.data_path)
26
+ self.video_num = len(self.data_all)
27
+
28
+ def __getitem__(self, index):
29
+
30
+ vframes = self.data_all[index]
31
+ total_frames = len(vframes)
32
+
33
+ # Sampling video frames
34
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
35
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
36
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, self.target_video_len, dtype=int)
37
+ select_video_frames = vframes[frame_indice[0]: frame_indice[-1]+1: self.frame_interval]
38
+
39
+ video_frames = []
40
+ for path in select_video_frames:
41
+ image = Image.open(path).convert('RGB')
42
+ video_frame = torch.as_tensor(np.array(image, dtype=np.uint8, copy=True)).unsqueeze(0)
43
+ video_frames.append(video_frame)
44
+ video_clip = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2)
45
+ video_clip = self.transform(video_clip)
46
+
47
+ # return video_clip, 1
48
+ return {'video': video_clip, 'video_name': 1}
49
+
50
+ def __len__(self):
51
+ return self.video_num
52
+
53
+ def load_video_frames(self, dataroot):
54
+ data_all = []
55
+ frame_list = os.walk(dataroot)
56
+ for _, meta in enumerate(frame_list):
57
+ root = meta[0]
58
+ try:
59
+ frames = sorted(meta[2], key=lambda item: int(item.split('.')[0].split('_')[-1]))
60
+ except:
61
+ print(meta[0], meta[2])
+ continue # skip directories whose frame names cannot be parsed
62
+ frames = [os.path.join(root, item) for item in frames if is_image_file(item)]
63
+ # if len(frames) > max(0, self.sequence_length * self.sample_every_n_frames):
64
+ if len(frames) != 0:
65
+ data_all.append(frames)
66
+ # self.video_num = len(data_all)
67
+ return data_all
68
+
69
+
70
+ if __name__ == '__main__':
71
+
72
+ import argparse
73
+ import torchvision
74
+ import video_transforms
75
+ import torch.utils.data as data
76
+
77
+ from torchvision import transforms
78
+ from torchvision.utils import save_image
79
+
80
+ parser = argparse.ArgumentParser()
81
+ parser.add_argument("--num_frames", type=int, default=16)
82
+ parser.add_argument("--frame_interval", type=int, default=4)
83
+ parser.add_argument("--load_from_ceph", type=bool, default=True)
84
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/taichi/taichi-256/frames/train")
85
+ config = parser.parse_args()
86
+
87
+
88
+ target_video_len = config.num_frames
89
+
90
+ temporal_sample = video_transforms.TemporalRandomCrop(target_video_len * config.frame_interval)
91
+ trans = transforms.Compose([
92
+ video_transforms.ToTensorVideo(),
93
+ video_transforms.RandomHorizontalFlipVideo(),
94
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
95
+ ])
96
+
97
+ taichi_dataset = Taichi(config, transform=trans, temporal_sample=temporal_sample)
98
+ taichi_dataloader = data.DataLoader(dataset=taichi_dataset, batch_size=1, shuffle=False, num_workers=1)
99
+
100
+ for i, video_data in enumerate(taichi_dataloader):
101
+ print(video_data['video'].shape)
102
+ # print(video_data.dtype)
103
+ # for i in range(target_video_len):
104
+ # save_image(video_data[0][i], os.path.join('./test_data', '%04d.png' % i), normalize=True, value_range=(-1, 1))
105
+
106
+ # video_ = ((video_data[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)
107
+ # torchvision.io.write_video('./test_data' + 'test.mp4', video_, fps=8)
108
+ # exit()
datasets/taichi_image_datasets.py ADDED
@@ -0,0 +1,139 @@
1
+ import os
2
+ import torch
3
+ import random
4
+ import torch.utils.data as data
5
+
6
+ import numpy as np
7
+ import io
8
+ import json
9
+ from PIL import Image
10
+
11
+ IMG_EXTENSIONS = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']
12
+
13
+ def is_image_file(filename):
14
+ return any(filename.endswith(extension) for extension in IMG_EXTENSIONS)
15
+
16
+ class TaichiImages(data.Dataset):
17
+ def __init__(self, configs, transform, temporal_sample=None, train=True):
18
+
19
+ self.configs = configs
20
+ self.data_path = configs.data_path
21
+ self.transform = transform
22
+ self.temporal_sample = temporal_sample
23
+ self.target_video_len = self.configs.num_frames
24
+ self.frame_interval = self.configs.frame_interval
25
+ self.data_all, self.video_frame_all = self.load_video_frames(self.data_path)
26
+ self.video_num = len(self.data_all)
27
+ self.video_frame_num = len(self.video_frame_all)
28
+
29
+ # sky video frames
30
+ random.shuffle(self.video_frame_all)
31
+ self.use_image_num = configs.use_image_num
32
+
33
+ def __getitem__(self, index):
34
+
35
+ video_index = index % self.video_num
36
+ vframes = self.data_all[video_index]
37
+ total_frames = len(vframes)
38
+
39
+ # Sampling video frames
40
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
41
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
42
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, self.target_video_len, dtype=int)
43
+ # print(frame_indice)
44
+ select_video_frames = vframes[frame_indice[0]: frame_indice[-1]+1: self.frame_interval]
45
+
46
+ video_frames = []
47
+ for path in select_video_frames:
48
+ image = Image.open(path).convert('RGB')
49
+ video_frame = torch.as_tensor(np.array(image, dtype=np.uint8, copy=True)).unsqueeze(0)
50
+ video_frames.append(video_frame)
51
+ video_clip = torch.cat(video_frames, dim=0).permute(0, 3, 1, 2)
52
+ video_clip = self.transform(video_clip)
53
+
54
+ # get video frames
55
+ images = []
56
+ for i in range(self.use_image_num):
57
+ while True:
58
+ try:
59
+ video_frame_path = self.video_frame_all[index+i]
60
+ image_path = os.path.join(self.data_path, video_frame_path)
61
+ image = Image.open(image_path).convert('RGB')
62
+ image = torch.as_tensor(np.array(image, dtype=np.uint8, copy=True)).unsqueeze(0)
63
+ images.append(image)
64
+ break
65
+ except Exception as e:
66
+ index = random.randint(0, self.video_frame_num - self.use_image_num)
67
+
68
+ images = torch.cat(images, dim=0).permute(0, 3, 1, 2)
69
+ images = self.transform(images)
70
+ assert len(images) == self.use_image_num
71
+
72
+ video_cat = torch.cat([video_clip, images], dim=0)
73
+
74
+ return {'video': video_cat, 'video_name': 1}
75
+
76
+ def __len__(self):
77
+ return self.video_frame_num
78
+
79
+ def load_video_frames(self, dataroot):
80
+ data_all = []
81
+ frames_all = []
82
+ frame_list = os.walk(dataroot)
83
+ for _, meta in enumerate(frame_list):
84
+ root = meta[0]
85
+ try:
86
+ frames = sorted(meta[2], key=lambda item: int(item.split('.')[0].split('_')[-1]))
87
+ except:
88
+ print(meta[0], meta[2])
+ continue # skip directories whose frame names cannot be parsed
89
+ frames = [os.path.join(root, item) for item in frames if is_image_file(item)]
90
+ # if len(frames) > max(0, self.sequence_length * self.sample_every_n_frames):
91
+ if len(frames) != 0:
92
+ data_all.append(frames)
93
+ for frame in frames:
94
+ frames_all.append(frame)
95
+ # self.video_num = len(data_all)
96
+ return data_all, frames_all
97
+
98
+
99
+ if __name__ == '__main__':
100
+
101
+ import argparse
102
+ import torchvision
103
+ import video_transforms
104
+ import torch.utils.data as data
105
+
106
+ from torchvision import transforms
107
+ from torchvision.utils import save_image
108
+
109
+ parser = argparse.ArgumentParser()
110
+ parser.add_argument("--num_frames", type=int, default=16)
111
+ parser.add_argument("--frame_interval", type=int, default=4)
112
+ parser.add_argument("--load_from_ceph", type=bool, default=True)
113
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/taichi/taichi-256/frames/train")
114
+ parser.add_argument("--use-image-num", type=int, default=5)
115
+ config = parser.parse_args()
116
+
117
+
118
+ target_video_len = config.num_frames
119
+
120
+ temporal_sample = video_transforms.TemporalRandomCrop(target_video_len * config.frame_interval)
121
+ trans = transforms.Compose([
122
+ video_transforms.ToTensorVideo(),
123
+ video_transforms.RandomHorizontalFlipVideo(),
124
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
125
+ ])
126
+
127
+ taichi_dataset = TaichiImages(config, transform=trans, temporal_sample=temporal_sample)
128
+ print(len(taichi_dataset))
129
+ taichi_dataloader = data.DataLoader(dataset=taichi_dataset, batch_size=1, shuffle=False, num_workers=1)
130
+
131
+ for i, video_data in enumerate(taichi_dataloader):
132
+ print(video_data['video'].shape)
133
+ # print(video_data.dtype)
134
+ # for i in range(target_video_len):
135
+ # save_image(video_data[0][i], os.path.join('./test_data', '%04d.png' % i), normalize=True, value_range=(-1, 1))
136
+
137
+ video_ = ((video_data[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1)
138
+ torchvision.io.write_video('./test_data' + 'test.mp4', video_, fps=8)
139
+ exit()
datasets/ucf101_datasets.py ADDED
@@ -0,0 +1,229 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import torch
5
+ import decord
6
+ import torchvision
7
+ import numpy as np
8
+
9
+
10
+ from PIL import Image
11
+ from einops import rearrange
12
+ from typing import Dict, List, Tuple
13
+
14
+ class_labels_map = None
15
+ cls_sample_cnt = None
16
+
19
+
20
+
21
+ def temporal_sampling(frames, start_idx, end_idx, num_samples):
22
+ """
23
+ Given the start and end frame index, sample num_samples frames between
24
+ the start and end with equal interval.
25
+ Args:
26
+ frames (tensor): a tensor of video frames, dimension is
27
+ `num video frames` x `channel` x `height` x `width`.
28
+ start_idx (int): the index of the start frame.
29
+ end_idx (int): the index of the end frame.
30
+ num_samples (int): number of frames to sample.
31
+ Returns:
32
+ frames (tensor): a tensor of temporally sampled video frames, dimension is
33
+ `num clip frames` x `channel` x `height` x `width`.
34
+ """
35
+ index = torch.linspace(start_idx, end_idx, num_samples)
36
+ index = torch.clamp(index, 0, frames.shape[0] - 1).long()
37
+ frames = torch.index_select(frames, 0, index)
38
+ return frames
39
+
40
+
41
+ def get_filelist(file_path):
42
+ Filelist = []
43
+ for home, dirs, files in os.walk(file_path):
44
+ for filename in files:
45
+ # list of file names, including the full path
46
+ Filelist.append(os.path.join(home, filename))
47
+ # # list of file names only, without the path
48
+ # Filelist.append( filename)
49
+ return Filelist
50
+
51
+
52
+ def load_annotation_data(data_file_path):
53
+ with open(data_file_path, 'r') as data_file:
54
+ return json.load(data_file)
55
+
56
+
57
+ def get_class_labels(num_class, anno_pth='./k400_classmap.json'):
58
+ global class_labels_map, cls_sample_cnt
59
+
60
+ if class_labels_map is not None:
61
+ return class_labels_map, cls_sample_cnt
62
+ else:
63
+ cls_sample_cnt = {}
64
+ class_labels_map = load_annotation_data(anno_pth)
65
+ for cls in class_labels_map:
66
+ cls_sample_cnt[cls] = 0
67
+ return class_labels_map, cls_sample_cnt
68
+
69
+
70
+ def load_annotations(ann_file, num_class, num_samples_per_cls):
71
+ dataset = []
72
+ class_to_idx, cls_sample_cnt = get_class_labels(num_class)
73
+ with open(ann_file, 'r') as fin:
74
+ for line in fin:
75
+ line_split = line.strip().split('\t')
76
+ sample = {}
77
+ idx = 0
78
+ # idx for frame_dir
79
+ frame_dir = line_split[idx]
80
+ sample['video'] = frame_dir
81
+ idx += 1
82
+
83
+ # idx for label[s]
84
+ label = [x for x in line_split[idx:]]
85
+ assert label, f'missing label in line: {line}'
86
+ assert len(label) == 1
87
+ class_name = label[0]
88
+ class_index = int(class_to_idx[class_name])
89
+
90
+ # choose a class subset of whole dataset
91
+ if class_index < num_class:
92
+ sample['label'] = class_index
93
+ if cls_sample_cnt[class_name] < num_samples_per_cls:
94
+ dataset.append(sample)
95
+ cls_sample_cnt[class_name]+=1
96
+
97
+ return dataset
98
+
99
+
100
+ def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]:
101
+ """Finds the class folders in a dataset.
102
+
103
+ See :class:`DatasetFolder` for details.
104
+ """
105
+ classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
106
+ if not classes:
107
+ raise FileNotFoundError(f"Couldn't find any class folder in {directory}.")
108
+
109
+ class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
110
+ return classes, class_to_idx
111
+
112
+
113
+ class DecordInit(object):
114
+ """Using Decord(https://github.com/dmlc/decord) to initialize the video_reader."""
115
+
116
+ def __init__(self, num_threads=1):
117
+ self.num_threads = num_threads
118
+ self.ctx = decord.cpu(0)
119
+
120
+ def __call__(self, filename):
121
+ """Perform the Decord initialization.
122
+ Args:
123
+ filename (str): path of the video file to be read.
125
+ """
126
+ reader = decord.VideoReader(filename,
127
+ ctx=self.ctx,
128
+ num_threads=self.num_threads)
129
+ return reader
130
+
131
+ def __repr__(self):
132
+ repr_str = (f'{self.__class__.__name__}('
134
+ f'num_threads={self.num_threads})')
135
+ return repr_str
136
+
137
+
138
+ class UCF101(torch.utils.data.Dataset):
139
+ """Load the UCF101 video files
140
+
141
+ Args:
142
+ target_video_len (int): the number of video frames to be loaded.
143
+ align_transform (callable): Align different videos in a specified size.
144
+ temporal_sample (callable): Sample the target length of a video.
145
+ """
146
+
147
+ def __init__(self,
148
+ configs,
149
+ transform=None,
150
+ temporal_sample=None):
151
+ self.configs = configs
152
+ self.data_path = configs.data_path
153
+ self.video_lists = get_filelist(configs.data_path)
154
+ self.transform = transform
155
+ self.temporal_sample = temporal_sample
156
+ self.target_video_len = self.configs.num_frames
157
+ self.v_decoder = DecordInit()
158
+ self.classes, self.class_to_idx = find_classes(self.data_path)
159
+ # print(self.class_to_idx)
160
+ # exit()
161
+
162
+ def __getitem__(self, index):
163
+ path = self.video_lists[index]
164
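+ # the parent directory name of the video file is the UCF101 class label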
+ class_name = path.split('/')[-2]
165
+ class_index = self.class_to_idx[class_name]
166
+
167
+ vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
168
+ total_frames = len(vframes)
169
+
170
+ # Sampling video frames
171
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
172
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
173
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, self.target_video_len, dtype=int)
174
+ # print(frame_indice)
175
+ video = vframes[frame_indice] #
176
+ video = self.transform(video) # T C H W
177
+
178
+ return {'video': video, 'video_name': class_index}
179
+
180
+ def __len__(self):
181
+ return len(self.video_lists)
182
+
183
+
184
+ if __name__ == '__main__':
185
+
186
+ import argparse
187
+ import video_transforms
188
+ import torch.utils.data as Data
189
+ import torchvision.transforms as transforms
190
+
191
+ from PIL import Image
192
+
193
+ parser = argparse.ArgumentParser()
194
+ parser.add_argument("--num_frames", type=int, default=16)
195
+ parser.add_argument("--frame_interval", type=int, default=1)
196
+ # parser.add_argument("--data-path", type=str, default="/nvme/share_data/datasets/UCF101/videos")
197
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/UCF101/videos/")
198
+ config = parser.parse_args()
199
+
200
+
201
+ temporal_sample = video_transforms.TemporalRandomCrop(config.num_frames * config.frame_interval)
202
+
203
+ transform_ucf101 = transforms.Compose([
204
+ video_transforms.ToTensorVideo(), # TCHW
205
+ video_transforms.RandomHorizontalFlipVideo(),
206
+ video_transforms.UCFCenterCropVideo(256),
207
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
208
+ ])
209
+
210
+
211
+ ffs_dataset = UCF101(config, transform=transform_ucf101, temporal_sample=temporal_sample)
212
+ ffs_dataloader = Data.DataLoader(dataset=ffs_dataset, batch_size=6, shuffle=False, num_workers=1)
213
+
214
+ # for i, video_data in enumerate(ffs_dataloader):
215
+ for video_data in ffs_dataloader:
216
+ print(type(video_data))
217
+ video = video_data['video']
218
+ video_name = video_data['video_name']
219
+ print(video.shape)
220
+ print(video_name)
221
+ # print(video_data[2])
222
+
223
+ # for i in range(16):
224
+ # img0 = rearrange(video_data[0][0][i], 'c h w -> h w c')
225
+ # print('Label: {}'.format(video_data[1]))
226
+ # print(img0.shape)
227
+ # img0 = Image.fromarray(np.uint8(img0 * 255))
228
+ # img0.save('./img{}.jpg'.format(i))
229
+ exit()
datasets/ucf101_image_datasets.py ADDED
@@ -0,0 +1,279 @@
1
+ import os, io
2
+ import re
3
+ import json
4
+ import torch
5
+ import decord
6
+ import torchvision
7
+ import numpy as np
8
+
9
+
10
+ from PIL import Image
11
+ from einops import rearrange
12
+ from typing import Dict, List, Tuple
13
+ from torchvision import transforms
14
+ import random
15
+
16
+
17
+ class_labels_map = None
18
+ cls_sample_cnt = None
19
+
22
+
23
+
24
+ def temporal_sampling(frames, start_idx, end_idx, num_samples):
25
+ """
26
+ Given the start and end frame index, sample num_samples frames between
27
+ the start and end with equal interval.
28
+ Args:
29
+ frames (tensor): a tensor of video frames, dimension is
30
+ `num video frames` x `channel` x `height` x `width`.
31
+ start_idx (int): the index of the start frame.
32
+ end_idx (int): the index of the end frame.
33
+ num_samples (int): number of frames to sample.
34
+ Returns:
35
+ frames (tensor): a tensor of temporally sampled video frames, dimension is
36
+ `num clip frames` x `channel` x `height` x `width`.
37
+ """
38
+ index = torch.linspace(start_idx, end_idx, num_samples)
39
+ index = torch.clamp(index, 0, frames.shape[0] - 1).long()
40
+ frames = torch.index_select(frames, 0, index)
41
+ return frames
42
+
43
+
44
+ def get_filelist(file_path):
45
+ Filelist = []
46
+ for home, dirs, files in os.walk(file_path):
47
+ for filename in files:
48
+ Filelist.append(os.path.join(home, filename))
49
+ # Filelist.append( filename)
50
+ return Filelist
51
+
52
+
53
+ def load_annotation_data(data_file_path):
54
+ with open(data_file_path, 'r') as data_file:
55
+ return json.load(data_file)
56
+
57
+
58
+ def get_class_labels(num_class, anno_pth='./k400_classmap.json'):
59
+ global class_labels_map, cls_sample_cnt
60
+
61
+ if class_labels_map is not None:
62
+ return class_labels_map, cls_sample_cnt
63
+ else:
64
+ cls_sample_cnt = {}
65
+ class_labels_map = load_annotation_data(anno_pth)
66
+ for cls in class_labels_map:
67
+ cls_sample_cnt[cls] = 0
68
+ return class_labels_map, cls_sample_cnt
69
+
70
+
71
+ def load_annotations(ann_file, num_class, num_samples_per_cls):
72
+ dataset = []
73
+ class_to_idx, cls_sample_cnt = get_class_labels(num_class)
74
+ with open(ann_file, 'r') as fin:
75
+ for line in fin:
76
+ line_split = line.strip().split('\t')
77
+ sample = {}
78
+ idx = 0
79
+ # idx for frame_dir
80
+ frame_dir = line_split[idx]
81
+ sample['video'] = frame_dir
82
+ idx += 1
83
+
84
+ # idx for label[s]
85
+ label = [x for x in line_split[idx:]]
86
+ assert label, f'missing label in line: {line}'
87
+ assert len(label) == 1
88
+ class_name = label[0]
89
+ class_index = int(class_to_idx[class_name])
90
+
91
+ # choose a class subset of whole dataset
92
+ if class_index < num_class:
93
+ sample['label'] = class_index
94
+ if cls_sample_cnt[class_name] < num_samples_per_cls:
95
+ dataset.append(sample)
96
+ cls_sample_cnt[class_name]+=1
97
+
98
+ return dataset
99
+
100
+
101
+ def find_classes(directory: str) -> Tuple[List[str], Dict[str, int]]:
102
+ """Finds the class folders in a dataset.
103
+
104
+ See :class:`DatasetFolder` for details.
105
+ """
106
+ classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
107
+ if not classes:
108
+ raise FileNotFoundError(f"Couldn't find any class folder in {directory}.")
109
+
110
+ class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
111
+ return classes, class_to_idx
112
+
113
+
114
+ class DecordInit(object):
115
+ """Using Decord(https://github.com/dmlc/decord) to initialize the video_reader."""
116
+
117
+ def __init__(self, num_threads=1):
118
+ self.num_threads = num_threads
119
+ self.ctx = decord.cpu(0)
120
+
121
+ def __call__(self, filename):
122
+ """Perform the Decord initialization.
123
+ Args:
124
+ filename (str): path of the video file to be read.
126
+ """
127
+ reader = decord.VideoReader(filename,
128
+ ctx=self.ctx,
129
+ num_threads=self.num_threads)
130
+ return reader
131
+
132
+ def __repr__(self):
133
+ repr_str = (f'{self.__class__.__name__}('
135
+ f'num_threads={self.num_threads})')
136
+ return repr_str
137
+
138
+
139
+ class UCF101Images(torch.utils.data.Dataset):
140
+ """Load the UCF101 video files
141
+
142
+ Args:
143
+ target_video_len (int): the number of video frames to be loaded.
144
+ align_transform (callable): Align different videos in a specified size.
145
+ temporal_sample (callable): Sample the target length of a video.
146
+ """
147
+
148
+ def __init__(self,
149
+ configs,
150
+ transform=None,
151
+ temporal_sample=None):
152
+ self.configs = configs
153
+ self.data_path = configs.data_path
154
+ self.video_lists = get_filelist(configs.data_path)
155
+ self.transform = transform
156
+ self.temporal_sample = temporal_sample
157
+ self.target_video_len = self.configs.num_frames
158
+ self.v_decoder = DecordInit()
159
+ self.classes, self.class_to_idx = find_classes(self.data_path)
160
+ self.video_num = len(self.video_lists)
161
+
162
+ # ucf101 video frames
163
+ self.frame_data_path = configs.frame_data_path # important
164
+
165
+ self.video_frame_txt = configs.frame_data_txt
166
+ self.video_frame_files = [frame_file.strip() for frame_file in open(self.video_frame_txt)]
167
+ random.shuffle(self.video_frame_files)
168
+ self.use_image_num = configs.use_image_num
169
+ self.image_tranform = transforms.Compose([
170
+ transforms.ToTensor(),
171
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
172
+ ])
173
+ self.video_frame_num = len(self.video_frame_files)
174
+
175
+
176
+ def __getitem__(self, index):
177
+
178
+ video_index = index % self.video_num
179
+ path = self.video_lists[video_index]
180
+ class_name = path.split('/')[-2]
181
+ class_index = self.class_to_idx[class_name]
182
+
183
+ vframes, aframes, info = torchvision.io.read_video(filename=path, pts_unit='sec', output_format='TCHW')
184
+ total_frames = len(vframes)
185
+
186
+ # Sampling video frames
187
+ start_frame_ind, end_frame_ind = self.temporal_sample(total_frames)
188
+ assert end_frame_ind - start_frame_ind >= self.target_video_len
189
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind-1, self.target_video_len, dtype=int)
190
+ video = vframes[frame_indice]
191
+
192
+ # video transformer data preprocessing
193
+ video = self.transform(video) # T C H W
194
+ images = []
195
+ image_names = []
196
+ for i in range(self.use_image_num):
197
+ while True:
198
+ try:
199
+ video_frame_path = self.video_frame_files[index+i]
200
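+ # frame file names are assumed to follow the UCF101 pattern "v_ClassName_gXX_cYY_...",
+ # so the class name is the second underscore-separated field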
+ image_class_name = video_frame_path.split('_')[1]
201
+ image_class_index = self.class_to_idx[image_class_name]
202
+ video_frame_path = os.path.join(self.frame_data_path, video_frame_path)
203
+ image = Image.open(video_frame_path).convert('RGB')
204
+ image = self.image_tranform(image).unsqueeze(0)
205
+ images.append(image)
206
+ image_names.append(str(image_class_index))
207
+ break
208
+ except Exception as e:
209
+ index = random.randint(0, self.video_frame_num - self.use_image_num)
210
+ images = torch.cat(images, dim=0)
211
+ assert len(images) == self.use_image_num
212
+ assert len(image_names) == self.use_image_num
213
+
214
+ image_names = '====='.join(image_names)
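+ # the per-image class indices are joined into one string so the default collate_fn can
+ # batch them; consumers split on '=====' to recover the list (see the __main__ block below)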
215
+
216
+ video_cat = torch.cat([video, images], dim=0)
217
+
218
+ return {'video': video_cat,
219
+ 'video_name': class_index,
220
+ 'image_name': image_names}
221
+
222
+ def __len__(self):
223
+ return self.video_frame_num
224
+
225
+
226
+ if __name__ == '__main__':
227
+
228
+ import argparse
229
+ import video_transforms
230
+ import torch.utils.data as Data
231
+ import torchvision.transforms as transforms
232
+
233
+ from PIL import Image
234
+
235
+ parser = argparse.ArgumentParser()
236
+ parser.add_argument("--num_frames", type=int, default=16)
237
+ parser.add_argument("--frame_interval", type=int, default=3)
238
+ parser.add_argument("--use-image-num", type=int, default=5)
239
+ parser.add_argument("--data-path", type=str, default="/path/to/datasets/UCF101/videos/")
240
+ parser.add_argument("--frame-data-path", type=str, default="/path/to/datasets/preprocessed_ffs/train/images/")
241
+ parser.add_argument("--frame-data-txt", type=str, default="/path/to/datasets/UCF101/train_256_list.txt")
242
+ config = parser.parse_args()
243
+
244
+
245
+ temporal_sample = video_transforms.TemporalRandomCrop(config.num_frames * config.frame_interval)
246
+
247
+ transform_ucf101 = transforms.Compose([
248
+ video_transforms.ToTensorVideo(), # TCHW
249
+ video_transforms.RandomHorizontalFlipVideo(),
250
+ video_transforms.UCFCenterCropVideo(256),
251
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
252
+ ])
253
+
254
+
255
+ ffs_dataset = UCF101Images(config, transform=transform_ucf101, temporal_sample=temporal_sample)
256
+ ffs_dataloader = Data.DataLoader(dataset=ffs_dataset, batch_size=6, shuffle=False, num_workers=1)
257
+
258
+ # for i, video_data in enumerate(ffs_dataloader):
259
+ for video_data in ffs_dataloader:
260
+ # print(type(video_data))
261
+ video = video_data['video']
262
+ # video_name = video_data['video_name']
263
+ print(video.shape)
264
+ print(video_data['image_name'])
265
+ image_name = video_data['image_name']
266
+ image_names = []
267
+ for caption in image_name:
268
+ single_caption = [int(item) for item in caption.split('=====')]
269
+ image_names.append(torch.as_tensor(single_caption))
270
+ print(image_names)
271
+ # print(video_name)
272
+ # print(video_data[2])
273
+
274
+ # for i in range(16):
275
+ # img0 = rearrange(video_data[0][0][i], 'c h w -> h w c')
276
+ # print('Label: {}'.format(video_data[1]))
277
+ # print(img0.shape)
278
+ # img0 = Image.fromarray(np.uint8(img0 * 255))
279
+ # img0.save('./img{}.jpg'.format(i))
datasets/video_transforms.py ADDED
@@ -0,0 +1,482 @@
1
+ import torch
2
+ import random
3
+ import numbers
4
+ import numpy as np
+ from PIL import Image
+ from torchvision.transforms import RandomCrop, RandomResizedCrop
5
+
6
+ def _is_tensor_video_clip(clip):
7
+ if not torch.is_tensor(clip):
8
+ raise TypeError("clip should be Tensor. Got %s" % type(clip))
9
+
10
+ if not clip.ndimension() == 4:
11
+ raise ValueError("clip should be 4D. Got %dD" % clip.dim())
12
+
13
+ return True
14
+
15
+
16
+ def center_crop_arr(pil_image, image_size):
17
+ """
18
+ Center cropping implementation from ADM.
19
+ https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py#L126
20
+ """
21
+ while min(*pil_image.size) >= 2 * image_size:
22
+ pil_image = pil_image.resize(
23
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
24
+ )
25
+
26
+ scale = image_size / min(*pil_image.size)
27
+ pil_image = pil_image.resize(
28
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
29
+ )
30
+
31
+ arr = np.array(pil_image)
32
+ crop_y = (arr.shape[0] - image_size) // 2
33
+ crop_x = (arr.shape[1] - image_size) // 2
34
+ return Image.fromarray(arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size])
35
+
36
+
37
+ def crop(clip, i, j, h, w):
38
+ """
39
+ Args:
40
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
41
+ """
42
+ if len(clip.size()) != 4:
43
+ raise ValueError("clip should be a 4D tensor")
44
+ return clip[..., i : i + h, j : j + w]
45
+
46
+
47
+ def resize(clip, target_size, interpolation_mode):
48
+ if len(target_size) != 2:
49
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
50
+ return torch.nn.functional.interpolate(clip, size=target_size, mode=interpolation_mode, align_corners=False)
51
+
52
+ def resize_scale(clip, target_size, interpolation_mode):
53
+ if len(target_size) != 2:
54
+ raise ValueError(f"target size should be tuple (height, width), instead got {target_size}")
55
+ H, W = clip.size(-2), clip.size(-1)
56
+ scale_ = target_size[0] / min(H, W)
57
+ return torch.nn.functional.interpolate(clip, scale_factor=scale_, mode=interpolation_mode, align_corners=False)
58
+
59
+
60
+ def resized_crop(clip, i, j, h, w, size, interpolation_mode="bilinear"):
61
+ """
62
+ Do spatial cropping and resizing to the video clip
63
+ Args:
64
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
65
+ i (int): i in (i,j) i.e coordinates of the upper left corner.
66
+ j (int): j in (i,j) i.e coordinates of the upper left corner.
67
+ h (int): Height of the cropped region.
68
+ w (int): Width of the cropped region.
69
+ size (tuple(int, int)): height and width of resized clip
70
+ Returns:
71
+ clip (torch.tensor): Resized and cropped clip. Size is (T, C, H, W)
72
+ """
73
+ if not _is_tensor_video_clip(clip):
74
+ raise ValueError("clip should be a 4D torch.tensor")
75
+ clip = crop(clip, i, j, h, w)
76
+ clip = resize(clip, size, interpolation_mode)
77
+ return clip
78
+
79
+
80
+ def center_crop(clip, crop_size):
81
+ if not _is_tensor_video_clip(clip):
82
+ raise ValueError("clip should be a 4D torch.tensor")
83
+ h, w = clip.size(-2), clip.size(-1)
84
+ th, tw = crop_size
85
+ if h < th or w < tw:
86
+ raise ValueError("height and width must be no smaller than crop_size")
87
+
88
+ i = int(round((h - th) / 2.0))
89
+ j = int(round((w - tw) / 2.0))
90
+ return crop(clip, i, j, th, tw)
91
+
92
+
93
+ def center_crop_using_short_edge(clip):
94
+ if not _is_tensor_video_clip(clip):
95
+ raise ValueError("clip should be a 4D torch.tensor")
96
+ h, w = clip.size(-2), clip.size(-1)
97
+ if h < w:
98
+ th, tw = h, h
99
+ i = 0
100
+ j = int(round((w - tw) / 2.0))
101
+ else:
102
+ th, tw = w, w
103
+ i = int(round((h - th) / 2.0))
104
+ j = 0
105
+ return crop(clip, i, j, th, tw)
106
+
107
+
108
+ def random_shift_crop(clip):
109
+ '''
110
+ Slide along the long edge, with the short edge as crop size
111
+ '''
112
+ if not _is_tensor_video_clip(clip):
113
+ raise ValueError("clip should be a 4D torch.tensor")
114
+ h, w = clip.size(-2), clip.size(-1)
115
+
116
+ if h <= w:
117
+ long_edge = w
118
+ short_edge = h
119
+ else:
120
+ long_edge = h
121
+ short_edge = w
122
+
123
+ th, tw = short_edge, short_edge
124
+
125
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
126
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
127
+ return crop(clip, i, j, th, tw)
128
+
129
+
130
+ def to_tensor(clip):
131
+ """
132
+ Convert tensor data type from uint8 to float and divide values by 255.0
133
+ (the clip is expected to already be in (T, C, H, W) order)
134
+ Args:
135
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
136
+ Return:
137
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
138
+ """
139
+ _is_tensor_video_clip(clip)
140
+ if not clip.dtype == torch.uint8:
141
+ raise TypeError("clip tensor should have data type uint8. Got %s" % str(clip.dtype))
142
+ # return clip.float().permute(3, 0, 1, 2) / 255.0
143
+ return clip.float() / 255.0
144
+
145
+
146
+ def normalize(clip, mean, std, inplace=False):
147
+ """
148
+ Args:
149
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
150
+ mean (tuple): pixel RGB mean. Size is (3)
151
+ std (tuple): pixel standard deviation. Size is (3)
152
+ Returns:
153
+ normalized clip (torch.tensor): Size is (T, C, H, W)
154
+ """
155
+ if not _is_tensor_video_clip(clip):
156
+ raise ValueError("clip should be a 4D torch.tensor")
157
+ if not inplace:
158
+ clip = clip.clone()
159
+ mean = torch.as_tensor(mean, dtype=clip.dtype, device=clip.device)
160
+ # print(mean)
161
+ std = torch.as_tensor(std, dtype=clip.dtype, device=clip.device)
162
+ clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
163
+ return clip
164
+
165
+
166
+ def hflip(clip):
167
+ """
168
+ Args:
169
+ clip (torch.tensor): Video clip to be normalized. Size is (T, C, H, W)
170
+ Returns:
171
+ flipped clip (torch.tensor): Size is (T, C, H, W)
172
+ """
173
+ if not _is_tensor_video_clip(clip):
174
+ raise ValueError("clip should be a 4D torch.tensor")
175
+ return clip.flip(-1)
176
+
177
+
178
+ class RandomCropVideo:
179
+ def __init__(self, size):
180
+ if isinstance(size, numbers.Number):
181
+ self.size = (int(size), int(size))
182
+ else:
183
+ self.size = size
184
+
185
+ def __call__(self, clip):
186
+ """
187
+ Args:
188
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
189
+ Returns:
190
+ torch.tensor: randomly cropped video clip.
191
+ size is (T, C, OH, OW)
192
+ """
193
+ i, j, h, w = self.get_params(clip)
194
+ return crop(clip, i, j, h, w)
195
+
196
+ def get_params(self, clip):
197
+ h, w = clip.shape[-2:]
198
+ th, tw = self.size
199
+
200
+ if h < th or w < tw:
201
+ raise ValueError(f"Required crop size {(th, tw)} is larger than input image size {(h, w)}")
202
+
203
+ if w == tw and h == th:
204
+ return 0, 0, h, w
205
+
206
+ i = torch.randint(0, h - th + 1, size=(1,)).item()
207
+ j = torch.randint(0, w - tw + 1, size=(1,)).item()
208
+
209
+ return i, j, th, tw
210
+
211
+ def __repr__(self) -> str:
212
+ return f"{self.__class__.__name__}(size={self.size})"
213
+
214
+ class CenterCropResizeVideo:
215
+ '''
216
+ Center crop the video using the short side as the crop size,
217
+ then resize it to the specified size
218
+ '''
219
+ def __init__(
220
+ self,
221
+ size,
222
+ interpolation_mode="bilinear",
223
+ ):
224
+ if isinstance(size, tuple):
225
+ if len(size) != 2:
226
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
227
+ self.size = size
228
+ else:
229
+ self.size = (size, size)
230
+
231
+ self.interpolation_mode = interpolation_mode
232
+
233
+
234
+ def __call__(self, clip):
235
+ """
236
+ Args:
237
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
238
+ Returns:
239
+ torch.tensor: scale resized / center cropped video clip.
240
+ size is (T, C, crop_size, crop_size)
241
+ """
242
+ clip_center_crop = center_crop_using_short_edge(clip)
243
+ clip_center_crop_resize = resize(clip_center_crop, target_size=self.size, interpolation_mode=self.interpolation_mode)
244
+ return clip_center_crop_resize
245
+
246
+ def __repr__(self) -> str:
247
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
248
+
249
+ class UCFCenterCropVideo:
250
+ '''
251
+ First scale the short edge to the specified size, keeping the aspect ratio,
252
+ then center crop to that size
253
+ '''
254
+ def __init__(
255
+ self,
256
+ size,
257
+ interpolation_mode="bilinear",
258
+ ):
259
+ if isinstance(size, tuple):
260
+ if len(size) != 2:
261
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
262
+ self.size = size
263
+ else:
264
+ self.size = (size, size)
265
+
266
+ self.interpolation_mode = interpolation_mode
267
+
268
+
269
+ def __call__(self, clip):
270
+ """
271
+ Args:
272
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
273
+ Returns:
274
+ torch.tensor: scale resized / center cropped video clip.
275
+ size is (T, C, crop_size, crop_size)
276
+ """
277
+ clip_resize = resize_scale(clip=clip, target_size=self.size, interpolation_mode=self.interpolation_mode)
278
+ clip_center_crop = center_crop(clip_resize, self.size)
279
+ return clip_center_crop
280
+
281
+ def __repr__(self) -> str:
282
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
283
+
284
+ class KineticsRandomCropResizeVideo:
285
+ '''
286
+ Slide along the long edge, with the short edge as the crop size, and resize to the desired size.
287
+ '''
288
+ def __init__(
289
+ self,
290
+ size,
291
+ interpolation_mode="bilinear",
292
+ ):
293
+ if isinstance(size, tuple):
294
+ if len(size) != 2:
295
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
296
+ self.size = size
297
+ else:
298
+ self.size = (size, size)
299
+
300
+ self.interpolation_mode = interpolation_mode
301
+
302
+ def __call__(self, clip):
303
+ clip_random_crop = random_shift_crop(clip)
304
+ clip_resize = resize(clip_random_crop, self.size, self.interpolation_mode)
305
+ return clip_resize
306
+
307
+
308
+ class CenterCropVideo:
309
+ def __init__(
310
+ self,
311
+ size,
312
+ interpolation_mode="bilinear",
313
+ ):
314
+ if isinstance(size, tuple):
315
+ if len(size) != 2:
316
+ raise ValueError(f"size should be tuple (height, width), instead got {size}")
317
+ self.size = size
318
+ else:
319
+ self.size = (size, size)
320
+
321
+ self.interpolation_mode = interpolation_mode
322
+
323
+
324
+ def __call__(self, clip):
325
+ """
326
+ Args:
327
+ clip (torch.tensor): Video clip to be cropped. Size is (T, C, H, W)
328
+ Returns:
329
+ torch.tensor: center cropped video clip.
330
+ size is (T, C, crop_size, crop_size)
331
+ """
332
+ clip_center_crop = center_crop(clip, self.size)
333
+ return clip_center_crop
334
+
335
+ def __repr__(self) -> str:
336
+ return f"{self.__class__.__name__}(size={self.size}, interpolation_mode={self.interpolation_mode}"
337
+
338
+
339
+ class NormalizeVideo:
340
+ """
341
+ Normalize the video clip by mean subtraction and division by standard deviation
342
+ Args:
343
+ mean (3-tuple): pixel RGB mean
344
+ std (3-tuple): pixel RGB standard deviation
345
+ inplace (boolean): whether do in-place normalization
346
+ """
347
+
348
+ def __init__(self, mean, std, inplace=False):
349
+ self.mean = mean
350
+ self.std = std
351
+ self.inplace = inplace
352
+
353
+ def __call__(self, clip):
354
+ """
355
+ Args:
356
+ clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)
357
+ """
358
+ return normalize(clip, self.mean, self.std, self.inplace)
359
+
360
+ def __repr__(self) -> str:
361
+ return f"{self.__class__.__name__}(mean={self.mean}, std={self.std}, inplace={self.inplace})"
362
+
363
+
364
+ class ToTensorVideo:
365
+ """
366
+ Convert tensor data type from uint8 to float and divide values by 255.0
367
+ without changing the (T, C, H, W) dimension order
368
+ """
369
+
370
+ def __init__(self):
371
+ pass
372
+
373
+ def __call__(self, clip):
374
+ """
375
+ Args:
376
+ clip (torch.tensor, dtype=torch.uint8): Size is (T, C, H, W)
377
+ Return:
378
+ clip (torch.tensor, dtype=torch.float): Size is (T, C, H, W)
379
+ """
380
+ return to_tensor(clip)
381
+
382
+ def __repr__(self) -> str:
383
+ return self.__class__.__name__
384
+
385
+
386
+ class RandomHorizontalFlipVideo:
387
+ """
388
+ Flip the video clip along the horizontal direction with a given probability
389
+ Args:
390
+ p (float): probability of the clip being flipped. Default value is 0.5
391
+ """
392
+
393
+ def __init__(self, p=0.5):
394
+ self.p = p
395
+
396
+ def __call__(self, clip):
397
+ """
398
+ Args:
399
+ clip (torch.tensor): Size is (T, C, H, W)
400
+ Return:
401
+ clip (torch.tensor): Size is (T, C, H, W)
402
+ """
403
+ if random.random() < self.p:
404
+ clip = hflip(clip)
405
+ return clip
406
+
407
+ def __repr__(self) -> str:
408
+ return f"{self.__class__.__name__}(p={self.p})"
409
+
410
+ # ------------------------------------------------------------
411
+ # --------------------- Sampling ---------------------------
412
+ # ------------------------------------------------------------
413
+ class TemporalRandomCrop(object):
414
+ """Temporally crop the given frame indices at a random location.
415
+
416
+ Args:
417
+ size (int): Desired number of frames to be seen by the model.
418
+ """
419
+
420
+ def __init__(self, size):
421
+ self.size = size
422
+
423
+ def __call__(self, total_frames):
424
+ rand_end = max(0, total_frames - self.size - 1)
425
+ begin_index = random.randint(0, rand_end)
426
+ end_index = min(begin_index + self.size, total_frames)
427
+ return begin_index, end_index
428
+
429
+
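+ # Note: TemporalRandomCrop returns a (begin, end) pair of frame *indices*, not frames;
+ # the caller is expected to slice the decoded clip itself, as the demo block below does.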
430
+ if __name__ == '__main__':
431
+ from torchvision import transforms
432
+ import torchvision.io as io
433
+ import numpy as np
434
+ from torchvision.utils import save_image
435
+ import os
436
+
437
+ vframes, aframes, info = io.read_video(
438
+ filename='./v_Archery_g01_c03.avi',
439
+ pts_unit='sec',
440
+ output_format='TCHW'
441
+ )
442
+
443
+ trans = transforms.Compose([
444
+ ToTensorVideo(),
445
+ RandomHorizontalFlipVideo(),
446
+ UCFCenterCropVideo(512),
447
+ # NormalizeVideo(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
448
+ transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)
449
+ ])
450
+
451
+ target_video_len = 32
452
+ frame_interval = 1
453
+ total_frames = len(vframes)
454
+ print(total_frames)
455
+
456
+ temporal_sample = TemporalRandomCrop(target_video_len * frame_interval)
457
+
458
+
459
+ # Sampling video frames
460
+ start_frame_ind, end_frame_ind = temporal_sample(total_frames)
461
+ # print(start_frame_ind)
462
+ # print(end_frame_ind)
463
+ assert end_frame_ind - start_frame_ind >= target_video_len
464
+ frame_indice = np.linspace(start_frame_ind, end_frame_ind - 1, target_video_len, dtype=int)
465
+ print(frame_indice)
466
+
467
+ select_vframes = vframes[frame_indice]
468
+ print(select_vframes.shape)
469
+ print(select_vframes.dtype)
470
+
471
+ select_vframes_trans = trans(select_vframes)
472
+ print(select_vframes_trans.shape)
473
+ print(select_vframes_trans.dtype)
474
+
475
+ select_vframes_trans_int = ((select_vframes_trans * 0.5 + 0.5) * 255).to(dtype=torch.uint8)
476
+ print(select_vframes_trans_int.dtype)
477
+ print(select_vframes_trans_int.permute(0, 2, 3, 1).shape)
478
+
479
+ io.write_video('./test.avi', select_vframes_trans_int.permute(0, 2, 3, 1), fps=8)
480
+
481
+ for i in range(target_video_len):
482
+ save_image(select_vframes_trans[i], os.path.join('./test000', '%04d.png' % i), normalize=True, value_range=(-1, 1))
demo.py ADDED
@@ -0,0 +1,284 @@
1
+ import gradio as gr
2
+ import os
3
+ import torch
4
+ import argparse
5
+ import torchvision
6
+
7
+
8
+ from diffusers.schedulers import (DDIMScheduler, DDPMScheduler, PNDMScheduler,
9
+ EulerDiscreteScheduler, DPMSolverMultistepScheduler,
10
+ HeunDiscreteScheduler, EulerAncestralDiscreteScheduler,
11
+ DEISMultistepScheduler, KDPM2AncestralDiscreteScheduler)
12
+ from diffusers.schedulers.scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler
13
+ from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
14
+ from omegaconf import OmegaConf
15
+ from transformers import T5EncoderModel, T5Tokenizer
16
+
17
+ import os, sys
18
+ sys.path.append(os.path.split(sys.path[0])[0])
19
+ from sample.pipeline_latte import LattePipeline
20
+ from models import get_models
21
+ # import imageio
22
+ from torchvision.utils import save_image
23
+ import spaces
24
+
25
+
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument("--config", type=str, default="./configs/t2x/t2v_sample.yaml")
28
+ args = parser.parse_args()
29
+ args = OmegaConf.load(args.config)
30
+
31
+ torch.set_grad_enabled(False)
32
+ device = "cuda" if torch.cuda.is_available() else "cpu"
33
+
34
+ transformer_model = get_models(args).to(device, dtype=torch.float16)
35
+ # state_dict = find_model(args.ckpt)
36
+ # msg, unexp = transformer_model.load_state_dict(state_dict, strict=False)
37
+
38
+ if args.enable_vae_temporal_decoder:
39
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(args.pretrained_model_path, subfolder="vae_temporal_decoder", torch_dtype=torch.float16).to(device)
40
+ else:
41
+ vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae", torch_dtype=torch.float16).to(device)
42
+ tokenizer = T5Tokenizer.from_pretrained(args.pretrained_model_path, subfolder="tokenizer")
43
+ text_encoder = T5EncoderModel.from_pretrained(args.pretrained_model_path, subfolder="text_encoder", torch_dtype=torch.float16).to(device)
44
+
45
+ # set eval mode
46
+ transformer_model.eval()
47
+ vae.eval()
48
+ text_encoder.eval()
49
+
50
+ @spaces.GPU
51
+ def gen_video(text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step):
52
+ torch.manual_seed(seed)
53
+ if sample_method == 'DDIM':
54
+ scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_path,
55
+ subfolder="scheduler",
56
+ beta_start=args.beta_start,
57
+ beta_end=args.beta_end,
58
+ beta_schedule=args.beta_schedule,
59
+ variance_type=args.variance_type,
60
+ clip_sample=False)
61
+ elif sample_method == 'EulerDiscrete':
62
+ scheduler = EulerDiscreteScheduler.from_pretrained(args.pretrained_model_path,
63
+ subfolder="scheduler",
64
+ beta_start=args.beta_start,
65
+ beta_end=args.beta_end,
66
+ beta_schedule=args.beta_schedule,
67
+ variance_type=args.variance_type)
68
+ elif sample_method == 'DDPM':
69
+ scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_path,
70
+ subfolder="scheduler",
71
+ beta_start=args.beta_start,
72
+ beta_end=args.beta_end,
73
+ beta_schedule=args.beta_schedule,
74
+ variance_type=args.variance_type,
75
+ clip_sample=False)
76
+ elif sample_method == 'DPMSolverMultistep':
77
+ scheduler = DPMSolverMultistepScheduler.from_pretrained(args.pretrained_model_path,
78
+ subfolder="scheduler",
79
+ beta_start=args.beta_start,
80
+ beta_end=args.beta_end,
81
+ beta_schedule=args.beta_schedule,
82
+ variance_type=args.variance_type)
83
+ elif sample_method == 'DPMSolverSinglestep':
84
+ scheduler = DPMSolverSinglestepScheduler.from_pretrained(args.pretrained_model_path,
85
+ subfolder="scheduler",
86
+ beta_start=args.beta_start,
87
+ beta_end=args.beta_end,
88
+ beta_schedule=args.beta_schedule,
89
+ variance_type=args.variance_type)
90
+ elif sample_method == 'PNDM':
91
+ scheduler = PNDMScheduler.from_pretrained(args.pretrained_model_path,
92
+ subfolder="scheduler",
93
+ beta_start=args.beta_start,
94
+ beta_end=args.beta_end,
95
+ beta_schedule=args.beta_schedule,
96
+ variance_type=args.variance_type)
97
+ elif sample_method == 'HeunDiscrete':
98
+ scheduler = HeunDiscreteScheduler.from_pretrained(args.pretrained_model_path,
99
+ subfolder="scheduler",
100
+ beta_start=args.beta_start,
101
+ beta_end=args.beta_end,
102
+ beta_schedule=args.beta_schedule,
103
+ variance_type=args.variance_type)
104
+ elif sample_method == 'EulerAncestralDiscrete':
105
+ scheduler = EulerAncestralDiscreteScheduler.from_pretrained(args.pretrained_model_path,
106
+ subfolder="scheduler",
107
+ beta_start=args.beta_start,
108
+ beta_end=args.beta_end,
109
+ beta_schedule=args.beta_schedule,
110
+ variance_type=args.variance_type)
111
+ elif sample_method == 'DEISMultistep':
112
+ scheduler = DEISMultistepScheduler.from_pretrained(args.pretrained_model_path,
113
+ subfolder="scheduler",
114
+ beta_start=args.beta_start,
115
+ beta_end=args.beta_end,
116
+ beta_schedule=args.beta_schedule,
117
+ variance_type=args.variance_type)
118
+ elif sample_method == 'KDPM2AncestralDiscrete':
119
+ scheduler = KDPM2AncestralDiscreteScheduler.from_pretrained(args.pretrained_model_path,
120
+ subfolder="scheduler",
121
+ beta_start=args.beta_start,
122
+ beta_end=args.beta_end,
123
+ beta_schedule=args.beta_schedule,
124
+ variance_type=args.variance_type)
125
+
126
+
127
+ videogen_pipeline = LattePipeline(vae=vae,
128
+ text_encoder=text_encoder,
129
+ tokenizer=tokenizer,
130
+ scheduler=scheduler,
131
+ transformer=transformer_model).to(device)
132
+ # videogen_pipeline.enable_xformers_memory_efficient_attention()
133
+
134
+ videos = videogen_pipeline(text_input,
135
+ video_length=video_length,
136
+ height=height,
137
+ width=width,
138
+ num_inference_steps=diffusion_step,
139
+ guidance_scale=scfg_scale,
140
+ enable_temporal_attentions=args.enable_temporal_attentions,
141
+ num_images_per_prompt=1,
142
+ mask_feature=True,
143
+ enable_vae_temporal_decoder=args.enable_vae_temporal_decoder
144
+ ).video
145
+
146
+ save_path = args.save_img_path + 'temp' + '.mp4'
147
+ torchvision.io.write_video(save_path, videos[0], fps=8)
148
+ return save_path
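+ # Note: torchvision.io.write_video expects a uint8 tensor of shape (T, H, W, C); the
+ # pipeline output indexed above is assumed to already be in that layout and value range.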
149
+
150
+
151
+ if not os.path.exists(args.save_img_path):
152
+ os.makedirs(args.save_img_path)
153
+
154
+ intro = """
155
+ <div style="display: flex;align-items: center;justify-content: center">
156
+ <h1 style="display: inline-block;margin-left: 10px;margin-top: 6px;font-weight: 500">Latte: Latent Diffusion Transformer for Video Generation</h1>
157
+ </div>
158
+ """
159
+
160
+ with gr.Blocks() as demo:
161
+ # gr.HTML(intro)
162
+ # with gr.Accordion("README", open=False):
163
+ # gr.HTML(
164
+ # """
165
+ # <p style="font-size: 0.95rem;margin: 0rem;line-height: 1.2em;margin-top:1em;display: inline-block">
166
+ # <a href="https://maxin-cn.github.io/latte_project/" target="_blank">project page</a> | <a href="https://arxiv.org/abs/2401.03048" target="_blank">paper</a>
167
+ # </p>
168
+
169
+ # We will continue update Latte.
170
+ # """
171
+ # )
172
+ gr.Markdown("<font color=red size=10><center>Latte: Latent Diffusion Transformer for Video Generation</center></font>")
173
+ gr.Markdown(
174
+ """<div style="display: flex;align-items: center;justify-content: center">
175
+ <h2 style="display: inline-block;margin-left: 10px;margin-top: 6px;font-weight: 500">Latte supports both T2I and T2V, and will be continuously updated, so stay tuned!</h2></div>
176
+ """
177
+ )
178
+ gr.Markdown(
179
+ """<div style="display: flex;align-items: center;justify-content: center">
180
+ [<a href="https://arxiv.org/abs/2401.03048">Arxiv Report</a>] | [<a href="https://maxin-cn.github.io/latte_project/">Project Page</a>] | [<a href="https://github.com/Vchitect/Latte">Github</a>]</div>
181
+ """
182
+ )
183
+
184
+
185
+ with gr.Row():
186
+ with gr.Column(visible=True) as input_raws:
187
+ with gr.Row():
188
+ with gr.Column(scale=1.0):
189
+ # text_input = gr.Textbox(show_label=True, interactive=True, label="Text prompt").style(container=False)
190
+ text_input = gr.Textbox(show_label=True, interactive=True, label="Prompt")
191
+ # with gr.Row():
192
+ # with gr.Column(scale=0.5):
193
+ # image_input = gr.Image(show_label=True, interactive=True, label="Reference image").style(container=False)
194
+ # with gr.Column(scale=0.5):
195
+ # preframe_input = gr.Image(show_label=True, interactive=True, label="First frame").style(container=False)
196
+ with gr.Row():
197
+ with gr.Column(scale=0.5):
198
+ sample_method = gr.Dropdown(choices=["DDIM", "EulerDiscrete", "PNDM"], label="Sample Method", value="DDIM")
199
+ # with gr.Row():
200
+ # with gr.Column(scale=1.0):
201
+ # video_length = gr.Slider(
202
+ # minimum=1,
203
+ # maximum=24,
204
+ # value=1,
205
+ # step=1,
206
+ # interactive=True,
207
+ # label="Video Length (1 for T2I and 16 for T2V)",
208
+ # )
209
+ with gr.Column(scale=0.5):
210
+ video_length = gr.Dropdown(choices=[1, 16], label="Video Length (1 for T2I and 16 for T2V)", value=16)
211
+ with gr.Row():
212
+ with gr.Column(scale=1.0):
213
+ scfg_scale = gr.Slider(
214
+ minimum=1,
215
+ maximum=50,
216
+ value=7.5,
217
+ step=0.1,
218
+ interactive=True,
219
+ label="Guidance Scale",
220
+ )
221
+ with gr.Row():
222
+ with gr.Column(scale=1.0):
223
+ seed = gr.Slider(
224
+ minimum=1,
225
+ maximum=2147483647,
226
+ value=100,
227
+ step=1,
228
+ interactive=True,
229
+ label="Seed",
230
+ )
231
+ with gr.Row():
232
+ with gr.Column(scale=0.5):
233
+ height = gr.Slider(
234
+ minimum=256,
235
+ maximum=768,
236
+ value=512,
237
+ step=16,
238
+ interactive=False,
239
+ label="Height",
240
+ )
241
+ # with gr.Row():
242
+ with gr.Column(scale=0.5):
243
+ width = gr.Slider(
244
+ minimum=256,
245
+ maximum=768,
246
+ value=512,
247
+ step=16,
248
+ interactive=False,
249
+ label="Width",
250
+ )
251
+ with gr.Row():
252
+ with gr.Column(scale=1.0):
253
+ diffusion_step = gr.Slider(
254
+ minimum=20,
255
+ maximum=250,
256
+ value=50,
257
+ step=1,
258
+ interactive=True,
259
+ label="Sampling Step",
260
+ )
261
+
262
+
263
+ with gr.Column(scale=0.6, visible=True) as video_upload:
264
+ # with gr.Column(visible=True) as video_upload:
265
+ output = gr.Video(interactive=False, include_audio=True, elem_id="output_video") #.style(height=360)
266
+ # with gr.Column(elem_id="image", scale=0.5) as img_part:
267
+ # with gr.Tab("Video", elem_id='video_tab'):
268
+
269
+ # with gr.Tab("Image", elem_id='image_tab'):
270
+ # up_image = gr.Image(type="pil", interactive=True, elem_id="image_upload").style(height=360)
271
+ # upload_button = gr.Button(value="Upload & Start Chat", interactive=True, variant="primary")
272
+ # clear = gr.Button("Restart")
273
+
274
+ with gr.Row():
275
+ with gr.Column(scale=1.0, min_width=0):
276
+ run = gr.Button("💭Run")
277
+ # with gr.Column(scale=0.5, min_width=0):
278
+ # clear = gr.Button("🔄Clear️")
279
+
280
+ run.click(gen_video, [text_input, sample_method, scfg_scale, seed, height, width, video_length, diffusion_step], [output])
281
+
282
+ demo.launch(debug=False, share=True)
283
+
284
+ # demo.launch(server_name="0.0.0.0", server_port=10034, enable_queue=True)
diffusion/__init__.py ADDED
@@ -0,0 +1,47 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ from . import gaussian_diffusion as gd
7
+ from .respace import SpacedDiffusion, space_timesteps
8
+
9
+
10
+ def create_diffusion(
11
+ timestep_respacing,
12
+ noise_schedule="linear",
13
+ use_kl=False,
14
+ sigma_small=False,
15
+ predict_xstart=False,
16
+ learn_sigma=True,
17
+ # learn_sigma=False,
18
+ rescale_learned_sigmas=False,
19
+ diffusion_steps=1000
20
+ ):
21
+ betas = gd.get_named_beta_schedule(noise_schedule, diffusion_steps)
22
+ if use_kl:
23
+ loss_type = gd.LossType.RESCALED_KL
24
+ elif rescale_learned_sigmas:
25
+ loss_type = gd.LossType.RESCALED_MSE
26
+ else:
27
+ loss_type = gd.LossType.MSE
28
+ if timestep_respacing is None or timestep_respacing == "":
29
+ timestep_respacing = [diffusion_steps]
30
+ return SpacedDiffusion(
31
+ use_timesteps=space_timesteps(diffusion_steps, timestep_respacing),
32
+ betas=betas,
33
+ model_mean_type=(
34
+ gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
35
+ ),
36
+ model_var_type=(
37
+ (
38
+ gd.ModelVarType.FIXED_LARGE
39
+ if not sigma_small
40
+ else gd.ModelVarType.FIXED_SMALL
41
+ )
42
+ if not learn_sigma
43
+ else gd.ModelVarType.LEARNED_RANGE
44
+ ),
45
+ loss_type=loss_type
46
+ # rescale_timesteps=rescale_timesteps,
47
+ )
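+ # Illustrative usage (argument values here are assumptions, not defaults taken from a config):
+ #   diffusion = create_diffusion(timestep_respacing="")        # full 1000-step training chain
+ #   sampler   = create_diffusion(timestep_respacing="ddim50")  # 50 evenly strided sampling steps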
diffusion/diffusion_utils.py ADDED
@@ -0,0 +1,88 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ import torch as th
7
+ import numpy as np
8
+
9
+
10
+ def normal_kl(mean1, logvar1, mean2, logvar2):
11
+ """
12
+ Compute the KL divergence between two gaussians.
13
+ Shapes are automatically broadcasted, so batches can be compared to
14
+ scalars, among other use cases.
15
+ """
16
+ tensor = None
17
+ for obj in (mean1, logvar1, mean2, logvar2):
18
+ if isinstance(obj, th.Tensor):
19
+ tensor = obj
20
+ break
21
+ assert tensor is not None, "at least one argument must be a Tensor"
22
+
23
+ # Force variances to be Tensors. Broadcasting helps convert scalars to
24
+ # Tensors, but it does not work for th.exp().
25
+ logvar1, logvar2 = [
26
+ x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
27
+ for x in (logvar1, logvar2)
28
+ ]
29
+
30
+ return 0.5 * (
31
+ -1.0
32
+ + logvar2
33
+ - logvar1
34
+ + th.exp(logvar1 - logvar2)
35
+ + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
36
+ )
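+ # The expression above is the standard closed-form KL between two diagonal Gaussians,
+ # KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))), evaluated elementwise.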
37
+
38
+
39
+ def approx_standard_normal_cdf(x):
40
+ """
41
+ A fast approximation of the cumulative distribution function of the
42
+ standard normal.
43
+ """
44
+ return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))
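+ # This is the cheap tanh-based approximation of the standard normal CDF Phi(x),
+ # the same functional form used in the tanh approximation of GELU.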
45
+
46
+
47
+ def continuous_gaussian_log_likelihood(x, *, means, log_scales):
48
+ """
49
+ Compute the log-likelihood of a continuous Gaussian distribution.
50
+ :param x: the targets
51
+ :param means: the Gaussian mean Tensor.
52
+ :param log_scales: the Gaussian log stddev Tensor.
53
+ :return: a tensor like x of log probabilities (in nats).
54
+ """
55
+ centered_x = x - means
56
+ inv_stdv = th.exp(-log_scales)
57
+ normalized_x = centered_x * inv_stdv
58
+ log_probs = th.distributions.Normal(th.zeros_like(x), th.ones_like(x)).log_prob(normalized_x)
59
+ return log_probs
60
+
61
+
62
+ def discretized_gaussian_log_likelihood(x, *, means, log_scales):
63
+ """
64
+ Compute the log-likelihood of a Gaussian distribution discretizing to a
65
+ given image.
66
+ :param x: the target images. It is assumed that this was uint8 values,
67
+ rescaled to the range [-1, 1].
68
+ :param means: the Gaussian mean Tensor.
69
+ :param log_scales: the Gaussian log stddev Tensor.
70
+ :return: a tensor like x of log probabilities (in nats).
71
+ """
72
+ assert x.shape == means.shape == log_scales.shape
73
+ centered_x = x - means
74
+ inv_stdv = th.exp(-log_scales)
75
+ plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
76
+ cdf_plus = approx_standard_normal_cdf(plus_in)
77
+ min_in = inv_stdv * (centered_x - 1.0 / 255.0)
78
+ cdf_min = approx_standard_normal_cdf(min_in)
79
+ log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
80
+ log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
81
+ cdf_delta = cdf_plus - cdf_min
82
+ log_probs = th.where(
83
+ x < -0.999,
84
+ log_cdf_plus,
85
+ th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
86
+ )
87
+ assert log_probs.shape == x.shape
88
+ return log_probs
diffusion/gaussian_diffusion.py ADDED
@@ -0,0 +1,881 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch as th
11
+ import enum
12
+
13
+ from .diffusion_utils import discretized_gaussian_log_likelihood, normal_kl
14
+
15
+
16
+ def mean_flat(tensor):
17
+ """
18
+ Take the mean over all non-batch dimensions.
19
+ """
20
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
21
+
22
+
23
+ class ModelMeanType(enum.Enum):
24
+ """
25
+ Which type of output the model predicts.
26
+ """
27
+
28
+ PREVIOUS_X = enum.auto() # the model predicts x_{t-1}
29
+ START_X = enum.auto() # the model predicts x_0
30
+ EPSILON = enum.auto() # the model predicts epsilon
31
+
32
+
33
+ class ModelVarType(enum.Enum):
34
+ """
35
+ What is used as the model's output variance.
36
+ The LEARNED_RANGE option has been added to allow the model to predict
37
+ values between FIXED_SMALL and FIXED_LARGE, making its job easier.
38
+ """
39
+
40
+ LEARNED = enum.auto()
41
+ FIXED_SMALL = enum.auto()
42
+ FIXED_LARGE = enum.auto()
43
+ LEARNED_RANGE = enum.auto()
44
+
45
+
46
+ class LossType(enum.Enum):
47
+ MSE = enum.auto() # use raw MSE loss (and KL when learning variances)
48
+ RESCALED_MSE = (
49
+ enum.auto()
50
+ ) # use raw MSE loss (with RESCALED_KL when learning variances)
51
+ KL = enum.auto() # use the variational lower-bound
52
+ RESCALED_KL = enum.auto() # like KL, but rescale to estimate the full VLB
53
+
54
+ def is_vb(self):
55
+ return self == LossType.KL or self == LossType.RESCALED_KL
56
+
57
+
58
+ def _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, warmup_frac):
59
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
60
+ warmup_time = int(num_diffusion_timesteps * warmup_frac)
61
+ betas[:warmup_time] = np.linspace(beta_start, beta_end, warmup_time, dtype=np.float64)
62
+ return betas
63
+
64
+
65
+ def get_beta_schedule(beta_schedule, *, beta_start, beta_end, num_diffusion_timesteps):
66
+ """
67
+ This is the deprecated API for creating beta schedules.
68
+ See get_named_beta_schedule() for the new library of schedules.
69
+ """
70
+ if beta_schedule == "quad":
71
+ betas = (
72
+ np.linspace(
73
+ beta_start ** 0.5,
74
+ beta_end ** 0.5,
75
+ num_diffusion_timesteps,
76
+ dtype=np.float64,
77
+ )
78
+ ** 2
79
+ )
80
+ elif beta_schedule == "linear":
81
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
82
+ elif beta_schedule == "warmup10":
83
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.1)
84
+ elif beta_schedule == "warmup50":
85
+ betas = _warmup_beta(beta_start, beta_end, num_diffusion_timesteps, 0.5)
86
+ elif beta_schedule == "const":
87
+ betas = beta_end * np.ones(num_diffusion_timesteps, dtype=np.float64)
88
+ elif beta_schedule == "jsd": # 1/T, 1/(T-1), 1/(T-2), ..., 1
89
+ betas = 1.0 / np.linspace(
90
+ num_diffusion_timesteps, 1, num_diffusion_timesteps, dtype=np.float64
91
+ )
92
+ else:
93
+ raise NotImplementedError(beta_schedule)
94
+ assert betas.shape == (num_diffusion_timesteps,)
95
+ return betas
96
+
97
+
98
+ def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
99
+ """
100
+ Get a pre-defined beta schedule for the given name.
101
+ The beta schedule library consists of beta schedules which remain similar
102
+ in the limit of num_diffusion_timesteps.
103
+ Beta schedules may be added, but should not be removed or changed once
104
+ they are committed to maintain backwards compatibility.
105
+ """
106
+ if schedule_name == "linear":
107
+ # Linear schedule from Ho et al, extended to work for any number of
108
+ # diffusion steps.
109
+ scale = 1000 / num_diffusion_timesteps
110
+ return get_beta_schedule(
111
+ "linear",
112
+ beta_start=scale * 0.0001,
113
+ beta_end=scale * 0.02,
114
+ num_diffusion_timesteps=num_diffusion_timesteps,
115
+ )
116
+ elif schedule_name == "squaredcos_cap_v2":
117
+ return betas_for_alpha_bar(
118
+ num_diffusion_timesteps,
119
+ lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
120
+ )
121
+ else:
122
+ raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
123
+
124
+
125
+ def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
126
+ """
127
+ Create a beta schedule that discretizes the given alpha_t_bar function,
128
+ which defines the cumulative product of (1-beta) over time from t = [0,1].
129
+ :param num_diffusion_timesteps: the number of betas to produce.
130
+ :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
131
+ produces the cumulative product of (1-beta) up to that
132
+ part of the diffusion process.
133
+ :param max_beta: the maximum beta to use; use values lower than 1 to
134
+ prevent singularities.
135
+ """
136
+ betas = []
137
+ for i in range(num_diffusion_timesteps):
138
+ t1 = i / num_diffusion_timesteps
139
+ t2 = (i + 1) / num_diffusion_timesteps
140
+ betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
141
+ return np.array(betas)
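+ # Used by the "squaredcos_cap_v2" schedule above: beta_t = 1 - alpha_bar(t2) / alpha_bar(t1),
+ # capped at max_beta, which gives the cosine noise schedule of Nichol & Dhariwal (2021).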
142
+
143
+
144
+ class GaussianDiffusion:
145
+ """
146
+ Utilities for training and sampling diffusion models.
147
+ Originally ported from this codebase:
148
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
149
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
150
+ starting at T and going to 1.
151
+ """
152
+
153
+ def __init__(
154
+ self,
155
+ *,
156
+ betas,
157
+ model_mean_type,
158
+ model_var_type,
159
+ loss_type
160
+ ):
161
+
162
+ self.model_mean_type = model_mean_type
163
+ self.model_var_type = model_var_type
164
+ self.loss_type = loss_type
165
+
166
+ # Use float64 for accuracy.
167
+ betas = np.array(betas, dtype=np.float64)
168
+ self.betas = betas
169
+ assert len(betas.shape) == 1, "betas must be 1-D"
170
+ assert (betas > 0).all() and (betas <= 1).all()
171
+
172
+ self.num_timesteps = int(betas.shape[0])
173
+
174
+ alphas = 1.0 - betas
175
+ self.alphas_cumprod = np.cumprod(alphas, axis=0)
176
+ self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
177
+ self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
178
+ assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)
179
+
180
+ # calculations for diffusion q(x_t | x_{t-1}) and others
181
+ self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
182
+ self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
183
+ self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
184
+ self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
185
+ self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)
186
+
187
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
188
+ self.posterior_variance = (
189
+ betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
190
+ )
191
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
192
+ self.posterior_log_variance_clipped = np.log(
193
+ np.append(self.posterior_variance[1], self.posterior_variance[1:])
194
+ ) if len(self.posterior_variance) > 1 else np.array([])
195
+
196
+ self.posterior_mean_coef1 = (
197
+ betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
198
+ )
199
+ self.posterior_mean_coef2 = (
200
+ (1.0 - self.alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - self.alphas_cumprod)
201
+ )
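+ # posterior_mean_coef1/2 are the closed-form coefficients of the DDPM posterior mean,
+ # mu_tilde(x_t, x_0) = coef1 * x_0 + coef2 * x_t (Ho et al. 2020, Eq. 7).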
202
+
203
+ def q_mean_variance(self, x_start, t):
204
+ """
205
+ Get the distribution q(x_t | x_0).
206
+ :param x_start: the [N x C x ...] tensor of noiseless inputs.
207
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
208
+ :return: A tuple (mean, variance, log_variance), all of x_start's shape.
209
+ """
210
+ mean = _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
211
+ variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
212
+ log_variance = _extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
213
+ return mean, variance, log_variance
214
+
215
+ def q_sample(self, x_start, t, noise=None):
216
+ """
217
+ Diffuse the data for a given number of diffusion steps.
218
+ In other words, sample from q(x_t | x_0).
219
+ :param x_start: the initial data batch.
220
+ :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
221
+ :param noise: if specified, the split-out normal noise.
222
+ :return: A noisy version of x_start.
223
+ """
224
+ if noise is None:
225
+ noise = th.randn_like(x_start)
226
+ assert noise.shape == x_start.shape
227
+ return (
228
+ _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
229
+ + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
230
+ )
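+ # i.e. the closed-form forward process: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise.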
231
+
232
+ def q_posterior_mean_variance(self, x_start, x_t, t):
233
+ """
234
+ Compute the mean and variance of the diffusion posterior:
235
+ q(x_{t-1} | x_t, x_0)
236
+ """
237
+ assert x_start.shape == x_t.shape
238
+ posterior_mean = (
239
+ _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
240
+ + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
241
+ )
242
+ posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
243
+ posterior_log_variance_clipped = _extract_into_tensor(
244
+ self.posterior_log_variance_clipped, t, x_t.shape
245
+ )
246
+ assert (
247
+ posterior_mean.shape[0]
248
+ == posterior_variance.shape[0]
249
+ == posterior_log_variance_clipped.shape[0]
250
+ == x_start.shape[0]
251
+ )
252
+ return posterior_mean, posterior_variance, posterior_log_variance_clipped
253
+
254
+ def p_mean_variance(self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None):
255
+ """
256
+ Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
257
+ the initial x, x_0.
258
+ :param model: the model, which takes a signal and a batch of timesteps
259
+ as input.
260
+ :param x: the [N x C x ...] tensor at time t.
261
+ :param t: a 1-D Tensor of timesteps.
262
+ :param clip_denoised: if True, clip the denoised signal into [-1, 1].
263
+ :param denoised_fn: if not None, a function which applies to the
264
+ x_start prediction before it is used to sample. Applies before
265
+ clip_denoised.
266
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
267
+ pass to the model. This can be used for conditioning.
268
+ :return: a dict with the following keys:
269
+ - 'mean': the model mean output.
270
+ - 'variance': the model variance output.
271
+ - 'log_variance': the log of 'variance'.
272
+ - 'pred_xstart': the prediction for x_0.
273
+ """
274
+ if model_kwargs is None:
275
+ model_kwargs = {}
276
+
277
+ B, F, C = x.shape[:3]
278
+ assert t.shape == (B,)
279
+ model_output = model(x, t, **model_kwargs)
280
+ # try:
281
+ # model_output = model_output.sample # for tav unet
282
+ # except:
283
+ # model_output = model(x, t, **model_kwargs)
284
+ if isinstance(model_output, tuple):
285
+ model_output, extra = model_output
286
+ else:
287
+ extra = None
288
+
289
+ if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
290
+ assert model_output.shape == (B, F, C * 2, *x.shape[3:])
291
+ model_output, model_var_values = th.split(model_output, C, dim=2)
292
+ min_log = _extract_into_tensor(self.posterior_log_variance_clipped, t, x.shape)
293
+ max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
294
+ # The model_var_values is [-1, 1] for [min_var, max_var].
295
+ frac = (model_var_values + 1) / 2
296
+ model_log_variance = frac * max_log + (1 - frac) * min_log
297
+ model_variance = th.exp(model_log_variance)
298
+ else:
299
+ model_variance, model_log_variance = {
300
+ # for fixedlarge, we set the initial (log-)variance like so
301
+ # to get a better decoder log likelihood.
302
+ ModelVarType.FIXED_LARGE: (
303
+ np.append(self.posterior_variance[1], self.betas[1:]),
304
+ np.log(np.append(self.posterior_variance[1], self.betas[1:])),
305
+ ),
306
+ ModelVarType.FIXED_SMALL: (
307
+ self.posterior_variance,
308
+ self.posterior_log_variance_clipped,
309
+ ),
310
+ }[self.model_var_type]
311
+ model_variance = _extract_into_tensor(model_variance, t, x.shape)
312
+ model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)
313
+
314
+ def process_xstart(x):
315
+ if denoised_fn is not None:
316
+ x = denoised_fn(x)
317
+ if clip_denoised:
318
+ return x.clamp(-1, 1)
319
+ return x
320
+
321
+ if self.model_mean_type == ModelMeanType.START_X:
322
+ pred_xstart = process_xstart(model_output)
323
+ else:
324
+ pred_xstart = process_xstart(
325
+ self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
326
+ )
327
+ model_mean, _, _ = self.q_posterior_mean_variance(x_start=pred_xstart, x_t=x, t=t)
328
+
329
+ assert model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
330
+ return {
331
+ "mean": model_mean,
332
+ "variance": model_variance,
333
+ "log_variance": model_log_variance,
334
+ "pred_xstart": pred_xstart,
335
+ "extra": extra,
336
+ }
337
+
338
+ def _predict_xstart_from_eps(self, x_t, t, eps):
339
+ assert x_t.shape == eps.shape
340
+ return (
341
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
342
+ - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
343
+ )
344
+
345
+ def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
346
+ return (
347
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart
348
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
349
+
350
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
351
+ """
352
+ Compute the mean for the previous step, given a function cond_fn that
353
+ computes the gradient of a conditional log probability with respect to
354
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
355
+ condition on y.
356
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
357
+ """
358
+ gradient = cond_fn(x, t, **model_kwargs)
359
+ new_mean = p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
360
+ return new_mean
361
+
362
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
363
+ """
364
+ Compute what the p_mean_variance output would have been, should the
365
+ model's score function be conditioned by cond_fn.
366
+ See condition_mean() for details on cond_fn.
367
+ Unlike condition_mean(), this instead uses the conditioning strategy
368
+ from Song et al (2020).
369
+ """
370
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
371
+
372
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
373
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(x, t, **model_kwargs)
374
+
375
+ out = p_mean_var.copy()
376
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
377
+ out["mean"], _, _ = self.q_posterior_mean_variance(x_start=out["pred_xstart"], x_t=x, t=t)
378
+ return out
379
+
380
+ def p_sample(
381
+ self,
382
+ model,
383
+ x,
384
+ t,
385
+ clip_denoised=True,
386
+ denoised_fn=None,
387
+ cond_fn=None,
388
+ model_kwargs=None,
389
+ ):
390
+ """
391
+ Sample x_{t-1} from the model at the given timestep.
392
+ :param model: the model to sample from.
393
+ :param x: the current tensor at x_{t-1}.
394
+ :param t: the value of t, starting at 0 for the first diffusion step.
395
+ :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
396
+ :param denoised_fn: if not None, a function which applies to the
397
+ x_start prediction before it is used to sample.
398
+ :param cond_fn: if not None, this is a gradient function that acts
399
+ similarly to the model.
400
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
401
+ pass to the model. This can be used for conditioning.
402
+ :return: a dict containing the following keys:
403
+ - 'sample': a random sample from the model.
404
+ - 'pred_xstart': a prediction of x_0.
405
+ """
406
+ out = self.p_mean_variance(
407
+ model,
408
+ x,
409
+ t,
410
+ clip_denoised=clip_denoised,
411
+ denoised_fn=denoised_fn,
412
+ model_kwargs=model_kwargs,
413
+ )
414
+ noise = th.randn_like(x)
415
+ nonzero_mask = (
416
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
417
+ ) # no noise when t == 0
418
+ if cond_fn is not None:
419
+ out["mean"] = self.condition_mean(cond_fn, out, x, t, model_kwargs=model_kwargs)
420
+ sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
421
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
422
+
423
+ def p_sample_loop(
424
+ self,
425
+ model,
426
+ shape,
427
+ noise=None,
428
+ clip_denoised=True,
429
+ denoised_fn=None,
430
+ cond_fn=None,
431
+ model_kwargs=None,
432
+ device=None,
433
+ progress=False,
434
+ ):
435
+ """
436
+ Generate samples from the model.
437
+ :param model: the model module.
438
+ :param shape: the shape of the samples, (N, C, H, W).
439
+ :param noise: if specified, the noise from the encoder to sample.
440
+ Should be of the same shape as `shape`.
441
+ :param clip_denoised: if True, clip x_start predictions to [-1, 1].
442
+ :param denoised_fn: if not None, a function which applies to the
443
+ x_start prediction before it is used to sample.
444
+ :param cond_fn: if not None, this is a gradient function that acts
445
+ similarly to the model.
446
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
447
+ pass to the model. This can be used for conditioning.
448
+ :param device: if specified, the device to create the samples on.
449
+ If not specified, use a model parameter's device.
450
+ :param progress: if True, show a tqdm progress bar.
451
+ :return: a non-differentiable batch of samples.
452
+ """
453
+ final = None
454
+ for sample in self.p_sample_loop_progressive(
455
+ model,
456
+ shape,
457
+ noise=noise,
458
+ clip_denoised=clip_denoised,
459
+ denoised_fn=denoised_fn,
460
+ cond_fn=cond_fn,
461
+ model_kwargs=model_kwargs,
462
+ device=device,
463
+ progress=progress,
464
+ ):
465
+ final = sample
466
+ return final["sample"]
467
+
468
+ def p_sample_loop_progressive(
469
+ self,
470
+ model,
471
+ shape,
472
+ noise=None,
473
+ clip_denoised=True,
474
+ denoised_fn=None,
475
+ cond_fn=None,
476
+ model_kwargs=None,
477
+ device=None,
478
+ progress=False,
479
+ ):
480
+ """
481
+ Generate samples from the model and yield intermediate samples from
482
+ each timestep of diffusion.
483
+ Arguments are the same as p_sample_loop().
484
+ Returns a generator over dicts, where each dict is the return value of
485
+ p_sample().
486
+ """
487
+ if device is None:
488
+ device = next(model.parameters()).device
489
+ assert isinstance(shape, (tuple, list))
490
+ if noise is not None:
491
+ img = noise
492
+ else:
493
+ img = th.randn(*shape, device=device)
494
+ indices = list(range(self.num_timesteps))[::-1]
495
+
496
+ if progress:
497
+ # Lazy import so that we don't depend on tqdm.
498
+ from tqdm.auto import tqdm
499
+
500
+ indices = tqdm(indices)
501
+
502
+ for i in indices:
503
+ t = th.tensor([i] * shape[0], device=device)
504
+ with th.no_grad():
505
+ out = self.p_sample(
506
+ model,
507
+ img,
508
+ t,
509
+ clip_denoised=clip_denoised,
510
+ denoised_fn=denoised_fn,
511
+ cond_fn=cond_fn,
512
+ model_kwargs=model_kwargs,
513
+ )
514
+ yield out
515
+ img = out["sample"]
516
+
517
+ def ddim_sample(
518
+ self,
519
+ model,
520
+ x,
521
+ t,
522
+ clip_denoised=True,
523
+ denoised_fn=None,
524
+ cond_fn=None,
525
+ model_kwargs=None,
526
+ eta=0.0,
527
+ ):
528
+ """
529
+ Sample x_{t-1} from the model using DDIM.
530
+ Same usage as p_sample().
531
+ """
532
+ out = self.p_mean_variance(
533
+ model,
534
+ x,
535
+ t,
536
+ clip_denoised=clip_denoised,
537
+ denoised_fn=denoised_fn,
538
+ model_kwargs=model_kwargs,
539
+ )
540
+ if cond_fn is not None:
541
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
542
+
543
+ # Usually our model outputs epsilon, but we re-derive it
544
+ # in case we used x_start or x_prev prediction.
545
+ eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
546
+
547
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
548
+ alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
549
+ sigma = (
550
+ eta
551
+ * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
552
+ * th.sqrt(1 - alpha_bar / alpha_bar_prev)
553
+ )
554
+ # Equation 12.
555
+ noise = th.randn_like(x)
556
+ mean_pred = (
557
+ out["pred_xstart"] * th.sqrt(alpha_bar_prev)
558
+ + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
559
+ )
560
+ nonzero_mask = (
561
+ (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
562
+ ) # no noise when t == 0
563
+ sample = mean_pred + nonzero_mask * sigma * noise
564
+ return {"sample": sample, "pred_xstart": out["pred_xstart"]}
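+ # With eta == 0 the sigma term vanishes and the update is the deterministic DDIM step;
+ # eta == 1 recovers the ancestral (DDPM-like) variance (Song et al. 2020, Eq. 12).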
565
+
566
+ def ddim_reverse_sample(
567
+ self,
568
+ model,
569
+ x,
570
+ t,
571
+ clip_denoised=True,
572
+ denoised_fn=None,
573
+ cond_fn=None,
574
+ model_kwargs=None,
575
+ eta=0.0,
576
+ ):
577
+ """
578
+ Sample x_{t+1} from the model using DDIM reverse ODE.
579
+ """
580
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
581
+ out = self.p_mean_variance(
582
+ model,
583
+ x,
584
+ t,
585
+ clip_denoised=clip_denoised,
586
+ denoised_fn=denoised_fn,
587
+ model_kwargs=model_kwargs,
588
+ )
589
+ if cond_fn is not None:
590
+ out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
591
+ # Usually our model outputs epsilon, but we re-derive it
592
+ # in case we used x_start or x_prev prediction.
593
+ eps = (
594
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
595
+ - out["pred_xstart"]
596
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
597
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
598
+
599
+ # Equation 12. reversed
600
+ mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps
601
+
602
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
603
+
604
+ def ddim_sample_loop(
605
+ self,
606
+ model,
607
+ shape,
608
+ noise=None,
609
+ clip_denoised=True,
610
+ denoised_fn=None,
611
+ cond_fn=None,
612
+ model_kwargs=None,
613
+ device=None,
614
+ progress=False,
615
+ eta=0.0,
616
+ ):
617
+ """
618
+ Generate samples from the model using DDIM.
619
+ Same usage as p_sample_loop().
620
+ """
621
+ final = None
622
+ for sample in self.ddim_sample_loop_progressive(
623
+ model,
624
+ shape,
625
+ noise=noise,
626
+ clip_denoised=clip_denoised,
627
+ denoised_fn=denoised_fn,
628
+ cond_fn=cond_fn,
629
+ model_kwargs=model_kwargs,
630
+ device=device,
631
+ progress=progress,
632
+ eta=eta,
633
+ ):
634
+ final = sample
635
+ return final["sample"]
636
+
637
+ def ddim_sample_loop_progressive(
638
+ self,
639
+ model,
640
+ shape,
641
+ noise=None,
642
+ clip_denoised=True,
643
+ denoised_fn=None,
644
+ cond_fn=None,
645
+ model_kwargs=None,
646
+ device=None,
647
+ progress=False,
648
+ eta=0.0,
649
+ ):
650
+ """
651
+ Use DDIM to sample from the model and yield intermediate samples from
652
+ each timestep of DDIM.
653
+ Same usage as p_sample_loop_progressive().
654
+ """
655
+ if device is None:
656
+ device = next(model.parameters()).device
657
+ assert isinstance(shape, (tuple, list))
658
+ if noise is not None:
659
+ img = noise
660
+ else:
661
+ img = th.randn(*shape, device=device)
662
+ indices = list(range(self.num_timesteps))[::-1]
663
+
664
+ if progress:
665
+ # Lazy import so that we don't depend on tqdm.
666
+ from tqdm.auto import tqdm
667
+
668
+ indices = tqdm(indices)
669
+
670
+ for i in indices:
671
+ t = th.tensor([i] * shape[0], device=device)
672
+ with th.no_grad():
673
+ out = self.ddim_sample(
674
+ model,
675
+ img,
676
+ t,
677
+ clip_denoised=clip_denoised,
678
+ denoised_fn=denoised_fn,
679
+ cond_fn=cond_fn,
680
+ model_kwargs=model_kwargs,
681
+ eta=eta,
682
+ )
683
+ yield out
684
+ img = out["sample"]
685
+
686
+ def _vb_terms_bpd(
687
+ self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
688
+ ):
689
+ """
690
+ Get a term for the variational lower-bound.
691
+ The resulting units are bits (rather than nats, as one might expect).
692
+ This allows for comparison to other papers.
693
+ :return: a dict with the following keys:
694
+ - 'output': a shape [N] tensor of NLLs or KLs.
695
+ - 'pred_xstart': the x_0 predictions.
696
+ """
697
+ true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
698
+ x_start=x_start, x_t=x_t, t=t
699
+ )
700
+ out = self.p_mean_variance(
701
+ model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
702
+ )
703
+ kl = normal_kl(
704
+ true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
705
+ )
706
+ kl = mean_flat(kl) / np.log(2.0)
707
+
708
+ decoder_nll = -discretized_gaussian_log_likelihood(
709
+ x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
710
+ )
711
+ assert decoder_nll.shape == x_start.shape
712
+ decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
713
+
714
+ # At the first timestep return the decoder NLL,
715
+ # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
716
+ output = th.where((t == 0), decoder_nll, kl)
717
+ return {"output": output, "pred_xstart": out["pred_xstart"]}
718
+
719
+ def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
720
+ """
721
+ Compute training losses for a single timestep.
722
+ :param model: the model to evaluate loss on.
723
+ :param x_start: the [N x C x ...] tensor of inputs.
724
+ :param t: a batch of timestep indices.
725
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
726
+ pass to the model. This can be used for conditioning.
727
+ :param noise: if specified, the specific Gaussian noise to try to remove.
728
+ :return: a dict with the key "loss" containing a tensor of shape [N].
729
+ Some mean or variance settings may also have other keys.
730
+ """
731
+ if model_kwargs is None:
732
+ model_kwargs = {}
733
+ if noise is None:
734
+ noise = th.randn_like(x_start)
735
+ x_t = self.q_sample(x_start, t, noise=noise)
736
+
737
+ terms = {}
738
+
739
+ if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
740
+ terms["loss"] = self._vb_terms_bpd(
741
+ model=model,
742
+ x_start=x_start,
743
+ x_t=x_t,
744
+ t=t,
745
+ clip_denoised=False,
746
+ model_kwargs=model_kwargs,
747
+ )["output"]
748
+ if self.loss_type == LossType.RESCALED_KL:
749
+ terms["loss"] *= self.num_timesteps
750
+ elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
751
+ model_output = model(x_t, t, **model_kwargs)
752
+ # try:
753
+ # model_output = model(x_t, t, **model_kwargs).sample # for tav unet
754
+ # except:
755
+ # model_output = model(x_t, t, **model_kwargs)
756
+
757
+ if self.model_var_type in [
758
+ ModelVarType.LEARNED,
759
+ ModelVarType.LEARNED_RANGE,
760
+ ]:
761
+ B, F, C = x_t.shape[:3]
762
+ assert model_output.shape == (B, F, C * 2, *x_t.shape[3:])
763
+ model_output, model_var_values = th.split(model_output, C, dim=2)
764
+ # Learn the variance using the variational bound, but don't let
765
+ # it affect our mean prediction.
766
+ frozen_out = th.cat([model_output.detach(), model_var_values], dim=2)
767
+ terms["vb"] = self._vb_terms_bpd(
768
+ model=lambda *args, r=frozen_out: r,
769
+ x_start=x_start,
770
+ x_t=x_t,
771
+ t=t,
772
+ clip_denoised=False,
773
+ )["output"]
774
+ if self.loss_type == LossType.RESCALED_MSE:
775
+ # Divide by 1000 for equivalence with initial implementation.
776
+ # Without a factor of 1/1000, the VB term hurts the MSE term.
777
+ terms["vb"] *= self.num_timesteps / 1000.0
778
+
779
+ target = {
780
+ ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
781
+ x_start=x_start, x_t=x_t, t=t
782
+ )[0],
783
+ ModelMeanType.START_X: x_start,
784
+ ModelMeanType.EPSILON: noise,
785
+ }[self.model_mean_type]
786
+ assert model_output.shape == target.shape == x_start.shape
787
+ terms["mse"] = mean_flat((target - model_output) ** 2)
788
+ if "vb" in terms:
789
+ terms["loss"] = terms["mse"] + terms["vb"]
790
+ else:
791
+ terms["loss"] = terms["mse"]
792
+ else:
793
+ raise NotImplementedError(self.loss_type)
794
+
795
+ return terms
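+ # Summary: for (RESCALED_)MSE the loss is an MSE against the target chosen by model_mean_type
+ # (noise for EPSILON, x_0 for START_X, posterior mean for PREVIOUS_X), plus a variational-bound
+ # term on the learned variance when model_var_type is LEARNED or LEARNED_RANGE.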
796
+
797
+ def _prior_bpd(self, x_start):
798
+ """
799
+ Get the prior KL term for the variational lower-bound, measured in
800
+ bits-per-dim.
801
+ This term can't be optimized, as it only depends on the encoder.
802
+ :param x_start: the [N x C x ...] tensor of inputs.
803
+ :return: a batch of [N] KL values (in bits), one per batch element.
804
+ """
805
+ batch_size = x_start.shape[0]
806
+ t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
807
+ qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
808
+ kl_prior = normal_kl(
809
+ mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
810
+ )
811
+ return mean_flat(kl_prior) / np.log(2.0)
812
+
813
+ def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
814
+ """
815
+ Compute the entire variational lower-bound, measured in bits-per-dim,
816
+ as well as other related quantities.
817
+ :param model: the model to evaluate loss on.
818
+ :param x_start: the [N x C x ...] tensor of inputs.
819
+ :param clip_denoised: if True, clip denoised samples.
820
+ :param model_kwargs: if not None, a dict of extra keyword arguments to
821
+ pass to the model. This can be used for conditioning.
822
+ :return: a dict containing the following keys:
823
+ - total_bpd: the total variational lower-bound, per batch element.
824
+ - prior_bpd: the prior term in the lower-bound.
825
+ - vb: an [N x T] tensor of terms in the lower-bound.
826
+ - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
827
+ - mse: an [N x T] tensor of epsilon MSEs for each timestep.
828
+ """
829
+ device = x_start.device
830
+ batch_size = x_start.shape[0]
831
+
832
+ vb = []
833
+ xstart_mse = []
834
+ mse = []
835
+ for t in list(range(self.num_timesteps))[::-1]:
836
+ t_batch = th.tensor([t] * batch_size, device=device)
837
+ noise = th.randn_like(x_start)
838
+ x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
839
+ # Calculate VLB term at the current timestep
840
+ with th.no_grad():
841
+ out = self._vb_terms_bpd(
842
+ model,
843
+ x_start=x_start,
844
+ x_t=x_t,
845
+ t=t_batch,
846
+ clip_denoised=clip_denoised,
847
+ model_kwargs=model_kwargs,
848
+ )
849
+ vb.append(out["output"])
850
+ xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
851
+ eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
852
+ mse.append(mean_flat((eps - noise) ** 2))
853
+
854
+ vb = th.stack(vb, dim=1)
855
+ xstart_mse = th.stack(xstart_mse, dim=1)
856
+ mse = th.stack(mse, dim=1)
857
+
858
+ prior_bpd = self._prior_bpd(x_start)
859
+ total_bpd = vb.sum(dim=1) + prior_bpd
860
+ return {
861
+ "total_bpd": total_bpd,
862
+ "prior_bpd": prior_bpd,
863
+ "vb": vb,
864
+ "xstart_mse": xstart_mse,
865
+ "mse": mse,
866
+ }
867
+
868
+
869
+ def _extract_into_tensor(arr, timesteps, broadcast_shape):
870
+ """
871
+ Extract values from a 1-D numpy array for a batch of indices.
872
+ :param arr: the 1-D numpy array.
873
+ :param timesteps: a tensor of indices into the array to extract.
874
+ :param broadcast_shape: a larger shape of K dimensions with the batch
875
+ dimension equal to the length of timesteps.
876
+ :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
877
+ """
878
+ res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
879
+ while len(res.shape) < len(broadcast_shape):
880
+ res = res[..., None]
881
+ return res + th.zeros(broadcast_shape, device=timesteps.device)
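+ # Adding th.zeros(broadcast_shape) is an explicit broadcast so the result has exactly
+ # broadcast_shape instead of [batch_size, 1, ..., 1].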
diffusion/respace.py ADDED
@@ -0,0 +1,130 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+ import torch
6
+ import numpy as np
7
+ import torch as th
8
+
9
+ from .gaussian_diffusion import GaussianDiffusion
10
+
11
+
12
+ def space_timesteps(num_timesteps, section_counts):
13
+ """
14
+ Create a list of timesteps to use from an original diffusion process,
15
+ given the number of timesteps we want to take from equally-sized portions
16
+ of the original process.
17
+ For example, if there's 300 timesteps and the section counts are [10,15,20]
18
+ then the first 100 timesteps are strided to be 10 timesteps, the second 100
19
+ are strided to be 15 timesteps, and the final 100 are strided to be 20.
20
+ If the stride is a string starting with "ddim", then the fixed striding
21
+ from the DDIM paper is used, and only one section is allowed.
22
+ :param num_timesteps: the number of diffusion steps in the original
23
+ process to divide up.
24
+ :param section_counts: either a list of numbers, or a string containing
25
+ comma-separated numbers, indicating the step count
26
+ per section. As a special case, use "ddimN" where N
27
+ is a number of steps to use the striding from the
28
+ DDIM paper.
29
+ :return: a set of diffusion steps from the original process to use.
30
+ """
31
+ if isinstance(section_counts, str):
32
+ if section_counts.startswith("ddim"):
33
+ desired_count = int(section_counts[len("ddim") :])
34
+ for i in range(1, num_timesteps):
35
+ if len(range(0, num_timesteps, i)) == desired_count:
36
+ return set(range(0, num_timesteps, i))
37
+ raise ValueError(
38
+ f"cannot create exactly {num_timesteps} steps with an integer stride"
39
+ )
40
+ section_counts = [int(x) for x in section_counts.split(",")]
41
+ size_per = num_timesteps // len(section_counts)
42
+ extra = num_timesteps % len(section_counts)
43
+ start_idx = 0
44
+ all_steps = []
45
+ for i, section_count in enumerate(section_counts):
46
+ size = size_per + (1 if i < extra else 0)
47
+ if size < section_count:
48
+ raise ValueError(
49
+ f"cannot divide section of {size} steps into {section_count}"
50
+ )
51
+ if section_count <= 1:
52
+ frac_stride = 1
53
+ else:
54
+ frac_stride = (size - 1) / (section_count - 1)
55
+ cur_idx = 0.0
56
+ taken_steps = []
57
+ for _ in range(section_count):
58
+ taken_steps.append(start_idx + round(cur_idx))
59
+ cur_idx += frac_stride
60
+ all_steps += taken_steps
61
+ start_idx += size
62
+ return set(all_steps)
63
+
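+ # Examples (sketch): space_timesteps(1000, "ddim50") == set(range(0, 1000, 20)),
+ # i.e. 50 evenly strided steps, while space_timesteps(100, [10]) picks
+ # {0, 11, 22, 33, 44, 55, 66, 77, 88, 99} from a single 100-step section.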
64
+
65
+ class SpacedDiffusion(GaussianDiffusion):
66
+ """
67
+ A diffusion process which can skip steps in a base diffusion process.
68
+ :param use_timesteps: a collection (sequence or set) of timesteps from the
69
+ original diffusion process to retain.
70
+ :param kwargs: the kwargs to create the base diffusion process.
71
+ """
72
+
73
+ def __init__(self, use_timesteps, **kwargs):
74
+ self.use_timesteps = set(use_timesteps)
75
+ self.timestep_map = []
76
+ self.original_num_steps = len(kwargs["betas"])
77
+
78
+ base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
79
+ last_alpha_cumprod = 1.0
80
+ new_betas = []
81
+ for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
82
+ if i in self.use_timesteps:
83
+ new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
84
+ last_alpha_cumprod = alpha_cumprod
85
+ self.timestep_map.append(i)
86
+ kwargs["betas"] = np.array(new_betas)
87
+ super().__init__(**kwargs)
88
+
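+ # Note on the beta re-derivation above (sketch): alpha_bar_t is the running product
+ # of (1 - beta_t), so setting beta'_j = 1 - alpha_bar_{t_j} / alpha_bar_{t_{j-1}} for
+ # the retained steps reproduces the same cumulative alphas on the shortened schedule.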
89
+ def p_mean_variance(
90
+ self, model, *args, **kwargs
91
+ ): # pylint: disable=signature-differs
92
+ return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
93
+
94
+ # @torch.compile
95
+ def training_losses(
96
+ self, model, *args, **kwargs
97
+ ): # pylint: disable=signature-differs
98
+ return super().training_losses(self._wrap_model(model), *args, **kwargs)
99
+
100
+ def condition_mean(self, cond_fn, *args, **kwargs):
101
+ return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
102
+
103
+ def condition_score(self, cond_fn, *args, **kwargs):
104
+ return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
105
+
106
+ def _wrap_model(self, model):
107
+ if isinstance(model, _WrappedModel):
108
+ return model
109
+ return _WrappedModel(
110
+ model, self.timestep_map, self.original_num_steps
111
+ )
112
+
113
+ def _scale_timesteps(self, t):
114
+ # Scaling is done by the wrapped model.
115
+ return t
116
+
117
+
118
+ class _WrappedModel:
119
+ def __init__(self, model, timestep_map, original_num_steps):
120
+ self.model = model
121
+ self.timestep_map = timestep_map
122
+ # self.rescale_timesteps = rescale_timesteps
123
+ self.original_num_steps = original_num_steps
124
+
125
+ def __call__(self, x, ts, **kwargs):
126
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
127
+ new_ts = map_tensor[ts]
128
+ # if self.rescale_timesteps:
129
+ # new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
130
+ return self.model(x, new_ts, **kwargs)
diffusion/timestep_sampler.py ADDED
@@ -0,0 +1,150 @@
1
+ # Modified from OpenAI's diffusion repos
2
+ # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py
3
+ # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
4
+ # IDDPM: https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
5
+
6
+ from abc import ABC, abstractmethod
7
+
8
+ import numpy as np
9
+ import torch as th
10
+ import torch.distributed as dist
11
+
12
+
13
+ def create_named_schedule_sampler(name, diffusion):
14
+ """
15
+ Create a ScheduleSampler from a library of pre-defined samplers.
16
+ :param name: the name of the sampler.
17
+ :param diffusion: the diffusion object to sample for.
18
+ """
19
+ if name == "uniform":
20
+ return UniformSampler(diffusion)
21
+ elif name == "loss-second-moment":
22
+ return LossSecondMomentResampler(diffusion)
23
+ else:
24
+ raise NotImplementedError(f"unknown schedule sampler: {name}")
25
+
26
+
27
+ class ScheduleSampler(ABC):
28
+ """
29
+ A distribution over timesteps in the diffusion process, intended to reduce
30
+ variance of the objective.
31
+ By default, samplers perform unbiased importance sampling, in which the
32
+ objective's mean is unchanged.
33
+ However, subclasses may override sample() to change how the resampled
34
+ terms are reweighted, allowing for actual changes in the objective.
35
+ """
36
+
37
+ @abstractmethod
38
+ def weights(self):
39
+ """
40
+ Get a numpy array of weights, one per diffusion step.
41
+ The weights needn't be normalized, but must be positive.
42
+ """
43
+
44
+ def sample(self, batch_size, device):
45
+ """
46
+ Importance-sample timesteps for a batch.
47
+ :param batch_size: the number of timesteps.
48
+ :param device: the torch device to save to.
49
+ :return: a tuple (timesteps, weights):
50
+ - timesteps: a tensor of timestep indices.
51
+ - weights: a tensor of weights to scale the resulting losses.
52
+ """
53
+ w = self.weights()
54
+ p = w / np.sum(w)
55
+ indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
56
+ indices = th.from_numpy(indices_np).long().to(device)
57
+ weights_np = 1 / (len(p) * p[indices_np])
58
+ weights = th.from_numpy(weights_np).float().to(device)
59
+ return indices, weights
60
+
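+ # Example (sketch): for UniformSampler over T timesteps, p[i] == 1/T, so the returned
+ # loss weights are all 1.0 and sample() reduces to uniform sampling of timesteps.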
61
+
62
+ class UniformSampler(ScheduleSampler):
63
+ def __init__(self, diffusion):
64
+ self.diffusion = diffusion
65
+ self._weights = np.ones([diffusion.num_timesteps])
66
+
67
+ def weights(self):
68
+ return self._weights
69
+
70
+
71
+ class LossAwareSampler(ScheduleSampler):
72
+ def update_with_local_losses(self, local_ts, local_losses):
73
+ """
74
+ Update the reweighting using losses from a model.
75
+ Call this method from each rank with a batch of timesteps and the
76
+ corresponding losses for each of those timesteps.
77
+ This method will perform synchronization to make sure all of the ranks
78
+ maintain the exact same reweighting.
79
+ :param local_ts: an integer Tensor of timesteps.
80
+ :param local_losses: a 1D Tensor of losses.
81
+ """
82
+ batch_sizes = [
83
+ th.tensor([0], dtype=th.int32, device=local_ts.device)
84
+ for _ in range(dist.get_world_size())
85
+ ]
86
+ dist.all_gather(
87
+ batch_sizes,
88
+ th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
89
+ )
90
+
91
+ # Pad all_gather batches to be the maximum batch size.
92
+ batch_sizes = [x.item() for x in batch_sizes]
93
+ max_bs = max(batch_sizes)
94
+
95
+ timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
96
+ loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
97
+ dist.all_gather(timestep_batches, local_ts)
98
+ dist.all_gather(loss_batches, local_losses)
99
+ timesteps = [
100
+ x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
101
+ ]
102
+ losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
103
+ self.update_with_all_losses(timesteps, losses)
104
+
105
+ @abstractmethod
106
+ def update_with_all_losses(self, ts, losses):
107
+ """
108
+ Update the reweighting using losses from a model.
109
+ Sub-classes should override this method to update the reweighting
110
+ using losses from the model.
111
+ This method directly updates the reweighting without synchronizing
112
+ between workers. It is called by update_with_local_losses from all
113
+ ranks with identical arguments. Thus, it should have deterministic
114
+ behavior to maintain state across workers.
115
+ :param ts: a list of int timesteps.
116
+ :param losses: a list of float losses, one per timestep.
117
+ """
118
+
119
+
120
+ class LossSecondMomentResampler(LossAwareSampler):
121
+ def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
122
+ self.diffusion = diffusion
123
+ self.history_per_term = history_per_term
124
+ self.uniform_prob = uniform_prob
125
+ self._loss_history = np.zeros(
126
+ [diffusion.num_timesteps, history_per_term], dtype=np.float64
127
+ )
128
+ self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=np.int64)
129
+
130
+ def weights(self):
131
+ if not self._warmed_up():
132
+ return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
133
+ weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
134
+ weights /= np.sum(weights)
135
+ weights *= 1 - self.uniform_prob
136
+ weights += self.uniform_prob / len(weights)
137
+ return weights
138
+
139
+ def update_with_all_losses(self, ts, losses):
140
+ for t, loss in zip(ts, losses):
141
+ if self._loss_counts[t] == self.history_per_term:
142
+ # Shift out the oldest loss term.
143
+ self._loss_history[t, :-1] = self._loss_history[t, 1:]
144
+ self._loss_history[t, -1] = loss
145
+ else:
146
+ self._loss_history[t, self._loss_counts[t]] = loss
147
+ self._loss_counts[t] += 1
148
+
149
+ def _warmed_up(self):
150
+ return (self._loss_counts == self.history_per_term).all()
docs/datasets_evaluation.md ADDED
@@ -0,0 +1,53 @@
1
+ ## Download datasets
2
+
3
+ Here are the links to download the datasets [FaceForensics](https://huggingface.co/datasets/maxin-cn/FaceForensics), [SkyTimelapse](https://huggingface.co/datasets/maxin-cn/SkyTimelapse/tree/main), [UCF101](https://www.crcv.ucf.edu/data/UCF101/UCF101.rar), and [Taichi-HD](https://huggingface.co/datasets/maxin-cn/Taichi-HD).
4
+
5
+
6
+ ## Dataset structure
7
+
8
+ All datasets retain their original structure. For video-image joint training, there is a `train_list.txt` file, where each line is formatted as `video_name/frame.jpg`. Below is an example from the FaceForensics dataset, followed by a minimal sketch for generating such a list.
11
+
12
+ ```bash
13
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000306.jpg
14
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000111.jpg
15
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000007.jpg
16
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000057.jpg
17
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000084.jpg
18
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000268.jpg
19
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000270.jpg
20
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000259.jpg
21
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000127.jpg
22
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000099.jpg
23
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000189.jpg
24
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000228.jpg
25
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000026.jpg
26
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000081.jpg
27
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000094.jpg
28
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000223.jpg
29
+ aS62n5PdTIU_1_8WGsQ0Y7uyU_1/000055.jpg
30
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000486.jpg
31
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000396.jpg
32
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000475.jpg
33
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000028.jpg
34
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000261.jpg
35
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000294.jpg
36
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000257.jpg
37
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000490.jpg
38
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000143.jpg
39
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000190.jpg
40
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000476.jpg
41
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000397.jpg
42
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000437.jpg
43
+ qEnKi82wWgE_2_rJPM8EdWShs_1/000071.jpg
44
+ ```
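+
+ A minimal sketch (not part of the repository) for generating such a `train_list.txt` from a root folder of extracted frames, assuming one sub-folder of `.jpg` frames per video, could look like this:
+
+ ```python
+ import os
+
+ frames_root = "path/to/frames"  # placeholder: one sub-folder of .jpg frames per video
+
+ with open("train_list.txt", "w") as f:
+     for video_name in sorted(os.listdir(frames_root)):
+         video_dir = os.path.join(frames_root, video_name)
+         if not os.path.isdir(video_dir):
+             continue
+         for frame in sorted(os.listdir(video_dir)):
+             if frame.endswith(".jpg"):
+                 f.write(f"{video_name}/{frame}\n")
+ ```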
45
+
46
+ ## Evaluation
47
+
48
+ We follow [StyleGAN-V](https://github.com/universome/stylegan-v) to measure the quality of the generated videos. The code for computing the relevant metrics is located in the [tools](../tools/) folder. To compute quantitative metrics for your generated results, put all real videos into one folder and convert them into video frames (do the same for the generated videos); a minimal frame-extraction sketch is shown after the command. Then run the following command on a single GPU:
49
+
50
+ ```bash
51
+ # cd Latte
52
+ bash tools/eval_metrics.sh
53
+ ```
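+
+ For reference, here is a minimal sketch (not part of the repository) for turning a folder of videos into per-video frame folders; `video_dir` and `frame_dir` are placeholder paths:
+
+ ```python
+ import os
+ import imageio
+
+ video_dir = "path/to/videos"   # real or generated videos
+ frame_dir = "path/to/frames"   # output root, one sub-folder per video
+
+ for name in sorted(os.listdir(video_dir)):
+     if not name.endswith(".mp4"):
+         continue
+     out_dir = os.path.join(frame_dir, os.path.splitext(name)[0])
+     os.makedirs(out_dir, exist_ok=True)
+     reader = imageio.get_reader(os.path.join(video_dir, name))
+     for idx, frame in enumerate(reader):
+         imageio.imwrite(os.path.join(out_dir, f"{idx:06d}.jpg"), frame)
+ ```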
docs/latte_diffusers.md ADDED
@@ -0,0 +1,106 @@
1
+ ## Requirements
2
+
3
+ Please follow the [README](../README.md) to install the environment. After installation, upgrade `diffusers` to at least version 0.30.0.
4
+
5
+ ## Inference
6
+
7
+ ```bash
8
+ from diffusers import LattePipeline
9
+ from diffusers.models import AutoencoderKLTemporalDecoder
10
+
11
+ from torchvision.utils import save_image
12
+
13
+ import torch
14
+ import imageio
15
+
16
+ torch.manual_seed(0)
17
+
18
+ device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ video_length = 1 # 1 or 16
20
+ pipe = LattePipeline.from_pretrained("maxin-cn/Latte-1", torch_dtype=torch.float16).to(device)
21
+
22
+ # to use the temporal decoder of the VAE, uncomment the following two lines
23
+ # vae = AutoencoderKLTemporalDecoder.from_pretrained("maxin-cn/Latte-1", subfolder="vae_temporal_decoder", torch_dtype=torch.float16).to(device)
24
+ # pipe.vae = vae
25
+
26
+ prompt = "a cat wearing sunglasses and working as a lifeguard at pool."
27
+ videos = pipe(prompt, video_length=video_length, output_type='pt').frames.cpu()
28
+
29
+ if video_length > 1:
30
+ videos = (videos.clamp(0, 1) * 255).to(dtype=torch.uint8) # convert to uint8
31
+ imageio.mimwrite('./latte_output.mp4', videos[0].permute(0, 2, 3, 1), fps=8, quality=5) # highest quality is 10, lowest is 0
32
+ else:
33
+ save_image(videos[0], './latte_output.png')
34
+ ```
35
+
36
+ ## Inference with 4/8-bit quantization
37
+ [@Aryan](https://github.com/a-r-r-o-w) provides a quantization solution for inference, which reduces GPU memory usage from 17 GB to 9 GB. Note that `bitsandbytes` must be installed first (`pip install bitsandbytes`).
38
+
39
+ ```python
40
+ import gc
41
+
42
+ import torch
43
+ from diffusers import LattePipeline
44
+ from transformers import T5EncoderModel, BitsAndBytesConfig
45
+ import imageio
46
+ from torchvision.utils import save_image
47
+
48
+ torch.manual_seed(0)
49
+
50
+ def flush():
51
+ gc.collect()
52
+ torch.cuda.empty_cache()
53
+
54
+ def bytes_to_giga_bytes(bytes):
55
+ return bytes / 1024 / 1024 / 1024
56
+
57
+ video_length = 16
58
+ model_id = "maxin-cn/Latte-1/"
59
+
60
+ text_encoder = T5EncoderModel.from_pretrained(
61
+ model_id,
62
+ subfolder="text_encoder",
63
+ quantization_config=BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16),
64
+ device_map="auto",
65
+ )
66
+ pipe = LattePipeline.from_pretrained(
67
+ model_id,
68
+ text_encoder=text_encoder,
69
+ transformer=None,
70
+ device_map="balanced",
71
+ )
72
+
73
+ with torch.no_grad():
74
+ prompt = "a cat wearing sunglasses and working as a lifeguard at pool."
75
+ negative_prompt = ""
76
+ prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
77
+
78
+ del text_encoder
79
+ del pipe
80
+ flush()
81
+
82
+ pipe = LattePipeline.from_pretrained(
83
+ model_id,
84
+ text_encoder=None,
85
+ torch_dtype=torch.float16,
86
+ ).to("cuda")
87
+ # pipe.enable_vae_tiling()
88
+ # pipe.enable_vae_slicing()
89
+
90
+ videos = pipe(
91
+ video_length=video_length,
92
+ num_inference_steps=50,
93
+ negative_prompt=None,
94
+ prompt_embeds=prompt_embeds,
95
+ negative_prompt_embeds=negative_prompt_embeds,
96
+ output_type="pt",
97
+ ).frames.cpu()
98
+
99
+ print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
100
+
101
+ if video_length > 1:
102
+ videos = (videos.clamp(0, 1) * 255).to(dtype=torch.uint8) # convert to uint8
103
+ imageio.mimwrite('./latte_output.mp4', videos[0].permute(0, 2, 3, 1), fps=8, quality=5) # highest quality is 10, lowest is 0
104
+ else:
105
+ save_image(videos[0], './latte_output.png')
106
+ ```
environment.yml ADDED
@@ -0,0 +1,25 @@
1
+ name: latte
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ dependencies:
6
+ - python >= 3.10
7
+ - pytorch > 2.0.0
8
+ - torchvision
9
+ - pytorch-cuda >= 11.7
10
+ - pip:
11
+ - timm
12
+ - diffusers[torch]==0.24.0
13
+ - accelerate
14
+ - tensorboard
15
+ - einops
16
+ - transformers
17
+ - av
18
+ - scikit-image
19
+ - decord
20
+ - pandas
21
+ - imageio-ffmpeg
22
+ - sentencepiece
23
+ - beautifulsoup4
24
+ - ftfy
25
+ - omegaconf
models/__init__.py ADDED
@@ -0,0 +1,52 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.split(sys.path[0])[0])
4
+
5
+ from .latte import Latte_models
6
+ from .latte_img import LatteIMG_models
7
+ from .latte_t2v import LatteT2V
8
+
9
+ from torch.optim.lr_scheduler import LambdaLR
10
+
11
+
12
+ def customized_lr_scheduler(optimizer, warmup_steps=5000): # 5000 from u-vit
13
+ from torch.optim.lr_scheduler import LambdaLR
14
+ def fn(step):
15
+ if warmup_steps > 0:
16
+ return min(step / warmup_steps, 1)
17
+ else:
18
+ return 1
19
+ return LambdaLR(optimizer, fn)
20
+
21
+
22
+ def get_lr_scheduler(optimizer, name, **kwargs):
23
+ if name == 'warmup':
24
+ return customized_lr_scheduler(optimizer, **kwargs)
25
+ elif name == 'cosine':
26
+ from torch.optim.lr_scheduler import CosineAnnealingLR
27
+ return CosineAnnealingLR(optimizer, **kwargs)
28
+ else:
29
+ raise NotImplementedError(name)
30
+
31
+ def get_models(args):
32
+ if 'LatteIMG' in args.model:
33
+ return LatteIMG_models[args.model](
34
+ input_size=args.latent_size,
35
+ num_classes=args.num_classes,
36
+ num_frames=args.num_frames,
37
+ learn_sigma=args.learn_sigma,
38
+ extras=args.extras
39
+ )
40
+ elif 'LatteT2V' in args.model:
41
+ return LatteT2V.from_pretrained(args.pretrained_model_path, subfolder="transformer", video_length=args.video_length)
42
+ elif 'Latte' in args.model:
43
+ return Latte_models[args.model](
44
+ input_size=args.latent_size,
45
+ num_classes=args.num_classes,
46
+ num_frames=args.num_frames,
47
+ learn_sigma=args.learn_sigma,
48
+ extras=args.extras
49
+ )
50
+ else:
51
+ raise '{} Model Not Supported!'.format(args.model)
52
+
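+ # Illustrative usage (sketch; the field names follow the training configs):
+ #     from argparse import Namespace
+ #     args = Namespace(model='Latte-XL/2', latent_size=32, num_classes=1000,
+ #                      num_frames=16, learn_sigma=True, extras=1)
+ #     model = get_models(args)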
models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (2.54 kB).
 
models/__pycache__/latte.cpython-312.pyc ADDED
Binary file (28.8 kB).
 
models/__pycache__/latte_img.cpython-312.pyc ADDED
Binary file (30.3 kB).
 
models/__pycache__/latte_t2v.cpython-312.pyc ADDED
Binary file (39 kB).
 
models/clip.py ADDED
@@ -0,0 +1,126 @@
1
+ import numpy
2
+ import torch.nn as nn
3
+ from transformers import CLIPTokenizer, CLIPTextModel, CLIPImageProcessor
4
+
5
+ import transformers
6
+ transformers.logging.set_verbosity_error()
7
+
8
+ """
9
+ You will encounter the following warning:
10
+ - This IS expected if you are initializing CLIPTextModel from the checkpoint of a model trained on another task
11
+ or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
12
+ - This IS NOT expected if you are initializing CLIPTextModel from the checkpoint of a model
13
+ that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
14
+
15
+ https://github.com/CompVis/stable-diffusion/issues/97
16
+ According to this issue, the warning is safe to ignore.
17
+
18
+ This is expected since the vision backbone of the CLIP model is not needed to run Stable Diffusion.
19
+ You can safely ignore the warning, it is not an error.
20
+
21
+ This CLIP usage follows U-ViT and is the same as in Stable Diffusion.
22
+ """
23
+
24
+ class AbstractEncoder(nn.Module):
25
+ def __init__(self):
26
+ super().__init__()
27
+
28
+ def encode(self, *args, **kwargs):
29
+ raise NotImplementedError
30
+
31
+
32
+ class FrozenCLIPEmbedder(AbstractEncoder):
33
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
34
+ # def __init__(self, version="openai/clip-vit-huge-patch14", device="cuda", max_length=77):
35
+ def __init__(self, path, device="cuda", max_length=77):
36
+ super().__init__()
37
+ self.tokenizer = CLIPTokenizer.from_pretrained(path, subfolder="tokenizer")
38
+ self.transformer = CLIPTextModel.from_pretrained(path, subfolder='text_encoder')
39
+ self.device = device
40
+ self.max_length = max_length
41
+ self.freeze()
42
+
43
+ def freeze(self):
44
+ self.transformer = self.transformer.eval()
45
+ for param in self.parameters():
46
+ param.requires_grad = False
47
+
48
+ def forward(self, text):
49
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
50
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
51
+ tokens = batch_encoding["input_ids"].to(self.device)
52
+ outputs = self.transformer(input_ids=tokens)
53
+
54
+ z = outputs.last_hidden_state
55
+ pooled_z = outputs.pooler_output
56
+ return z, pooled_z
57
+
58
+ def encode(self, text):
59
+ return self(text)
60
+
61
+
62
+ class TextEmbedder(nn.Module):
63
+ """
64
+ Embeds text prompt into vector representations. Also handles text dropout for classifier-free guidance.
65
+ """
66
+ def __init__(self, path, dropout_prob=0.1):
67
+ super().__init__()
68
+ self.text_encodder = FrozenCLIPEmbedder(path=path)
69
+ self.dropout_prob = dropout_prob
70
+
71
+ def token_drop(self, text_prompts, force_drop_ids=None):
72
+ """
73
+ Drops text to enable classifier-free guidance.
74
+ """
75
+ if force_drop_ids is None:
76
+ drop_ids = numpy.random.uniform(0, 1, len(text_prompts)) < self.dropout_prob
77
+ else:
78
+ # TODO
79
+ drop_ids = force_drop_ids == 1
80
+ labels = list(numpy.where(drop_ids, "", text_prompts))
81
+ # print(labels)
82
+ return labels
83
+
84
+ def forward(self, text_prompts, train, force_drop_ids=None):
85
+ use_dropout = self.dropout_prob > 0
86
+ if (train and use_dropout) or (force_drop_ids is not None):
87
+ text_prompts = self.token_drop(text_prompts, force_drop_ids)
88
+ embeddings, pooled_embeddings = self.text_encodder(text_prompts)
89
+ # return embeddings, pooled_embeddings
90
+ return pooled_embeddings
91
+
92
+
93
+ if __name__ == '__main__':
94
+
95
+ r"""
96
+ Returns:
97
+
98
+ Examples from CLIPTextModel:
99
+
100
+ ```python
101
+ >>> from transformers import AutoTokenizer, CLIPTextModel
102
+
103
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
104
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
105
+
106
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
107
+
108
+ >>> outputs = model(**inputs)
109
+ >>> last_hidden_state = outputs.last_hidden_state
110
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
111
+ ```"""
112
+
113
+ import torch
114
+
115
+ device = "cuda" if torch.cuda.is_available() else "cpu"
116
+
117
+ text_encoder = TextEmbedder(path='/mnt/petrelfs/maxin/work/pretrained/stable-diffusion-2-1-base',
118
+ dropout_prob=0.00001).to(device)
119
+
120
+ text_prompt = [["a photo of a cat", "a photo of a cat"], ["a photo of a dog", "a photo of a cat"], ['a photo of a dog human', "a photo of a cat"]]
121
+ # text_prompt = ('None', 'None', 'None')
122
+ output, pooled_output = text_encoder(text_prompts=text_prompt, train=False)
123
+ # print(output)
124
+ print(output.shape)
125
+ print(pooled_output.shape)
126
+ # print(output.shape)
models/latte.py ADDED
@@ -0,0 +1,526 @@
1
+ # All rights reserved.
2
+
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ # --------------------------------------------------------
6
+ # References:
7
+ # GLIDE: https://github.com/openai/glide-text2im
8
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
9
+ # --------------------------------------------------------
10
+ import math
11
+ import torch
12
+ import torch.nn as nn
13
+ import numpy as np
14
+
15
+ from einops import rearrange, repeat
16
+ from timm.models.vision_transformer import Mlp, PatchEmbed
17
+
18
+ # the xformers lib allows less memory, faster training and inference
19
+ try:
20
+ import xformers
21
+ import xformers.ops
+ XFORMERS_IS_AVAILBLE = True
22
+ except ImportError:
23
+ XFORMERS_IS_AVAILBLE = False
24
+
25
+ # from timm.models.layers.helpers import to_2tuple
26
+ # from timm.models.layers.trace_utils import _assert
27
+
28
+ def modulate(x, shift, scale):
29
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
30
+
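+ # Sketch: x is (B, N, D) and shift/scale are (B, D); unsqueeze(1) broadcasts them over
+ # the token dimension, giving the adaLN-style shift-and-scale used inside each block.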
31
+ #################################################################################
32
+ # Attention Layers from TIMM #
33
+ #################################################################################
34
+
35
+ class Attention(nn.Module):
36
+ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., use_lora=False, attention_mode='math'):
37
+ super().__init__()
38
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
39
+ self.num_heads = num_heads
40
+ head_dim = dim // num_heads
41
+ self.scale = head_dim ** -0.5
42
+ self.attention_mode = attention_mode
43
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
44
+ self.attn_drop = nn.Dropout(attn_drop)
45
+ self.proj = nn.Linear(dim, dim)
46
+ self.proj_drop = nn.Dropout(proj_drop)
47
+
48
+ def forward(self, x):
49
+ B, N, C = x.shape
50
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4).contiguous()
51
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
52
+
53
+ if self.attention_mode == 'xformers': # cause loss nan while using with amp
54
+ # https://github.com/facebookresearch/xformers/blob/e8bd8f932c2f48e3a3171d06749eecbbf1de420c/xformers/ops/fmha/__init__.py#L135
55
+ q_xf = q.transpose(1,2).contiguous()
56
+ k_xf = k.transpose(1,2).contiguous()
57
+ v_xf = v.transpose(1,2).contiguous()
58
+ x = xformers.ops.memory_efficient_attention(q_xf, k_xf, v_xf).reshape(B, N, C)
59
+
60
+ elif self.attention_mode == 'flash':
61
+ # cause loss nan while using with amp
62
+ # Optionally use the context manager to ensure one of the fused kernels is run
63
+ with torch.backends.cuda.sdp_kernel(enable_math=False):
64
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v).transpose(1, 2).reshape(B, N, C) # requires pytorch 2.0
65
+
66
+ elif self.attention_mode == 'math':
67
+ attn = (q @ k.transpose(-2, -1)) * self.scale
68
+ attn = attn.softmax(dim=-1)
69
+ attn = self.attn_drop(attn)
70
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
71
+
72
+ else:
73
+ raise NotImplementedError(f'attention mode {self.attention_mode} is not supported')
74
+
75
+ x = self.proj(x)
76
+ x = self.proj_drop(x)
77
+ return x
78
+
79
+
80
+ #################################################################################
81
+ # Embedding Layers for Timesteps and Class Labels #
82
+ #################################################################################
83
+
84
+ class TimestepEmbedder(nn.Module):
85
+ """
86
+ Embeds scalar timesteps into vector representations.
87
+ """
88
+ def __init__(self, hidden_size, frequency_embedding_size=256):
89
+ super().__init__()
90
+ self.mlp = nn.Sequential(
91
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
92
+ nn.SiLU(),
93
+ nn.Linear(hidden_size, hidden_size, bias=True),
94
+ )
95
+ self.frequency_embedding_size = frequency_embedding_size
96
+
97
+ @staticmethod
98
+ def timestep_embedding(t, dim, max_period=10000):
99
+ """
100
+ Create sinusoidal timestep embeddings.
101
+ :param t: a 1-D Tensor of N indices, one per batch element.
102
+ These may be fractional.
103
+ :param dim: the dimension of the output.
104
+ :param max_period: controls the minimum frequency of the embeddings.
105
+ :return: an (N, D) Tensor of positional embeddings.
106
+ """
107
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
108
+ half = dim // 2
109
+ freqs = torch.exp(
110
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
111
+ ).to(device=t.device)
112
+ args = t[:, None].float() * freqs[None]
113
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
114
+ if dim % 2:
115
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
116
+ return embedding
117
+
118
+ def forward(self, t, use_fp16=False):
119
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
120
+ if use_fp16:
121
+ t_freq = t_freq.to(dtype=torch.float16)
122
+ t_emb = self.mlp(t_freq)
123
+ return t_emb
124
+
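+ # Example (sketch): timestep_embedding(torch.tensor([0, 250, 999]), 256) returns a
+ # (3, 256) tensor of concatenated cos/sin features, which the MLP above then maps
+ # to a (3, hidden_size) conditioning vector.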
125
+
126
+ class LabelEmbedder(nn.Module):
127
+ """
128
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
129
+ """
130
+ def __init__(self, num_classes, hidden_size, dropout_prob):
131
+ super().__init__()
132
+ use_cfg_embedding = dropout_prob > 0
133
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
134
+ self.num_classes = num_classes
135
+ self.dropout_prob = dropout_prob
136
+
137
+ def token_drop(self, labels, force_drop_ids=None):
138
+ """
139
+ Drops labels to enable classifier-free guidance.
140
+ """
141
+ if force_drop_ids is None:
142
+ drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
143
+ else:
144
+ drop_ids = force_drop_ids == 1
145
+ labels = torch.where(drop_ids, self.num_classes, labels)
146
+ return labels
147
+
148
+ def forward(self, labels, train, force_drop_ids=None):
149
+ use_dropout = self.dropout_prob > 0
150
+ if (train and use_dropout) or (force_drop_ids is not None):
151
+ labels = self.token_drop(labels, force_drop_ids)
152
+ embeddings = self.embedding_table(labels)
153
+ return embeddings
154
+
155
+
156
+ #################################################################################
157
+ # Core Latte Model #
158
+ #################################################################################
159
+
160
+ class TransformerBlock(nn.Module):
161
+ """
162
+ A Latte transformer block with adaptive layer norm zero (adaLN-Zero) conditioning.
163
+ """
164
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
165
+ super().__init__()
166
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
167
+ self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
168
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
169
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
170
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
171
+ self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
172
+ self.adaLN_modulation = nn.Sequential(
173
+ nn.SiLU(),
174
+ nn.Linear(hidden_size, 6 * hidden_size, bias=True)
175
+ )
176
+
177
+ def forward(self, x, c):
178
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
179
+ x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
180
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
181
+ return x
182
+
183
+
184
+ class FinalLayer(nn.Module):
185
+ """
186
+ The final layer of Latte.
187
+ """
188
+ def __init__(self, hidden_size, patch_size, out_channels):
189
+ super().__init__()
190
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
191
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
192
+ self.adaLN_modulation = nn.Sequential(
193
+ nn.SiLU(),
194
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
195
+ )
196
+
197
+ def forward(self, x, c):
198
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
199
+ x = modulate(self.norm_final(x), shift, scale)
200
+ x = self.linear(x)
201
+ return x
202
+
203
+
204
+ class Latte(nn.Module):
205
+ """
206
+ Diffusion model with a Transformer backbone.
207
+ """
208
+ def __init__(
209
+ self,
210
+ input_size=32,
211
+ patch_size=2,
212
+ in_channels=4,
213
+ hidden_size=1152,
214
+ depth=28,
215
+ num_heads=16,
216
+ mlp_ratio=4.0,
217
+ num_frames=16,
218
+ class_dropout_prob=0.1,
219
+ num_classes=1000,
220
+ learn_sigma=True,
221
+ extras=1,
222
+ attention_mode='math',
223
+ ):
224
+ super().__init__()
225
+ self.learn_sigma = learn_sigma
226
+ self.in_channels = in_channels
227
+ self.out_channels = in_channels * 2 if learn_sigma else in_channels
228
+ self.patch_size = patch_size
229
+ self.num_heads = num_heads
230
+ self.extras = extras
231
+ self.num_frames = num_frames
232
+
233
+ self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
234
+ self.t_embedder = TimestepEmbedder(hidden_size)
235
+
236
+ if self.extras == 2:
237
+ self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
238
+ if self.extras == 78: # timestep + text_embedding
239
+ self.text_embedding_projection = nn.Sequential(
240
+ nn.SiLU(),
241
+ nn.Linear(77 * 768, hidden_size, bias=True)
242
+ )
243
+
244
+ num_patches = self.x_embedder.num_patches
245
+ # Will use fixed sin-cos embedding:
246
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
247
+ self.temp_embed = nn.Parameter(torch.zeros(1, num_frames, hidden_size), requires_grad=False)
248
+ self.hidden_size = hidden_size
249
+
250
+ self.blocks = nn.ModuleList([
251
+ TransformerBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, attention_mode=attention_mode) for _ in range(depth)
252
+ ])
253
+
254
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
255
+ self.initialize_weights()
256
+
257
+ def initialize_weights(self):
258
+ # Initialize transformer layers:
259
+ def _basic_init(module):
260
+ if isinstance(module, nn.Linear):
261
+ torch.nn.init.xavier_uniform_(module.weight)
262
+ if module.bias is not None:
263
+ nn.init.constant_(module.bias, 0)
264
+ self.apply(_basic_init)
265
+
266
+ # Initialize (and freeze) pos_embed by sin-cos embedding:
267
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
268
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
269
+
270
+ temp_embed = get_1d_sincos_temp_embed(self.temp_embed.shape[-1], self.temp_embed.shape[-2])
271
+ self.temp_embed.data.copy_(torch.from_numpy(temp_embed).float().unsqueeze(0))
272
+
273
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
274
+ w = self.x_embedder.proj.weight.data
275
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
276
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
277
+
278
+ if self.extras == 2:
279
+ # Initialize label embedding table:
280
+ nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
281
+
282
+ # Initialize timestep embedding MLP:
283
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
284
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
285
+
286
+ # Zero-out adaLN modulation layers in Latte blocks:
287
+ for block in self.blocks:
288
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
289
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
290
+
291
+ # Zero-out output layers:
292
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
293
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
294
+ nn.init.constant_(self.final_layer.linear.weight, 0)
295
+ nn.init.constant_(self.final_layer.linear.bias, 0)
296
+
297
+ def unpatchify(self, x):
298
+ """
299
+ x: (N, T, patch_size**2 * C)
300
+ imgs: (N, H, W, C)
301
+ """
302
+ c = self.out_channels
303
+ p = self.x_embedder.patch_size[0]
304
+ h = w = int(x.shape[1] ** 0.5)
305
+ assert h * w == x.shape[1]
306
+
307
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
308
+ x = torch.einsum('nhwpqc->nchpwq', x)
309
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
310
+ return imgs
311
+
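+ # Sketch: with input_size=32 and patch_size=2 there are 16 * 16 = 256 tokens per frame,
+ # so unpatchify maps (N, 256, 2 * 2 * out_channels) back to (N, out_channels, 32, 32).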
312
+ # @torch.cuda.amp.autocast()
313
+ # @torch.compile
314
+ def forward(self,
315
+ x,
316
+ t,
317
+ y=None,
318
+ text_embedding=None,
319
+ use_fp16=False):
320
+ """
321
+ Forward pass of Latte.
322
+ x: (N, F, C, H, W) tensor of video inputs
323
+ t: (N,) tensor of diffusion timesteps
324
+ y: (N,) tensor of class labels
325
+ """
326
+ if use_fp16:
327
+ x = x.to(dtype=torch.float16)
328
+
329
+ batches, frames, channels, high, weight = x.shape
330
+ x = rearrange(x, 'b f c h w -> (b f) c h w')
331
+ x = self.x_embedder(x) + self.pos_embed
332
+ t = self.t_embedder(t, use_fp16=use_fp16)
333
+ timestep_spatial = repeat(t, 'n d -> (n c) d', c=self.temp_embed.shape[1])
334
+ timestep_temp = repeat(t, 'n d -> (n c) d', c=self.pos_embed.shape[1])
335
+
336
+ if self.extras == 2:
337
+ y = self.y_embedder(y, self.training)
338
+ y_spatial = repeat(y, 'n d -> (n c) d', c=self.temp_embed.shape[1])
339
+ y_temp = repeat(y, 'n d -> (n c) d', c=self.pos_embed.shape[1])
340
+ elif self.extras == 78:
341
+ text_embedding = self.text_embedding_projection(text_embedding.reshape(batches, -1))
342
+ text_embedding_spatial = repeat(text_embedding, 'n d -> (n c) d', c=self.temp_embed.shape[1])
343
+ text_embedding_temp = repeat(text_embedding, 'n d -> (n c) d', c=self.pos_embed.shape[1])
344
+
345
+ for i in range(0, len(self.blocks), 2):
346
+ spatial_block, temp_block = self.blocks[i:i+2]
347
+ if self.extras == 2:
348
+ c = timestep_spatial + y_spatial
349
+ elif self.extras == 78:
350
+ c = timestep_spatial + text_embedding_spatial
351
+ else:
352
+ c = timestep_spatial
353
+ x = spatial_block(x, c)
354
+
355
+ x = rearrange(x, '(b f) t d -> (b t) f d', b=batches)
356
+ # Add Time Embedding
357
+ if i == 0:
358
+ x = x + self.temp_embed
359
+
360
+ if self.extras == 2:
361
+ c = timestep_temp + y_temp
362
+ elif self.extras == 78:
363
+ c = timestep_temp + text_embedding_temp
364
+ else:
365
+ c = timestep_temp
366
+
367
+ x = temp_block(x, c)
368
+ x = rearrange(x, '(b t) f d -> (b f) t d', b=batches)
369
+
370
+ if self.extras == 2:
371
+ c = timestep_spatial + y_spatial
372
+ else:
373
+ c = timestep_spatial
374
+ x = self.final_layer(x, c)
375
+ x = self.unpatchify(x)
376
+ x = rearrange(x, '(b f) c h w -> b f c h w', b=batches)
377
+ return x
378
+
379
+ def forward_with_cfg(self, x, t, y=None, cfg_scale=7.0, use_fp16=False, text_embedding=None):
380
+ """
381
+ Forward pass of Latte, but also batches the unconditional forward pass for classifier-free guidance.
382
+ """
383
+ # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
384
+ half = x[: len(x) // 2]
385
+ combined = torch.cat([half, half], dim=0)
386
+ if use_fp16:
387
+ combined = combined.to(dtype=torch.float16)
388
+ model_out = self.forward(combined, t, y=y, use_fp16=use_fp16, text_embedding=text_embedding)
389
+ # For exact reproducibility reasons, we apply classifier-free guidance on only
390
+ # the noise-prediction channels (the first four latent channels) by default; the standard approach to cfg applies it to all channels.
391
+ # This can be done by uncommenting the following line and commenting-out the line following that.
392
+ # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
393
+ # eps, rest = model_out[:, :3], model_out[:, 3:]
394
+ eps, rest = model_out[:, :, :4, ...], model_out[:, :, 4:, ...]
395
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
396
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
397
+ eps = torch.cat([half_eps, half_eps], dim=0)
398
+ return torch.cat([eps, rest], dim=2)
399
+
400
+
401
+ #################################################################################
402
+ # Sine/Cosine Positional Embedding Functions #
403
+ #################################################################################
404
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
405
+
406
+ def get_1d_sincos_temp_embed(embed_dim, length):
407
+ pos = torch.arange(0, length).unsqueeze(1)
408
+ return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
409
+
410
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
411
+ """
412
+ grid_size: int of the grid height and width
413
+ return:
414
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
415
+ """
416
+ grid_h = np.arange(grid_size, dtype=np.float32)
417
+ grid_w = np.arange(grid_size, dtype=np.float32)
418
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
419
+ grid = np.stack(grid, axis=0)
420
+
421
+ grid = grid.reshape([2, 1, grid_size, grid_size])
422
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
423
+ if cls_token and extra_tokens > 0:
424
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
425
+ return pos_embed
426
+
427
+
428
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
429
+ assert embed_dim % 2 == 0
430
+
431
+ # use half of dimensions to encode grid_h
432
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
433
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
434
+
435
+ emb = np.concatenate([emb_h, emb_w], axis=1)
436
+ return emb
437
+
438
+
439
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
440
+ """
441
+ embed_dim: output dimension for each position
442
+ pos: a list of positions to be encoded: size (M,)
443
+ out: (M, D)
444
+ """
445
+ assert embed_dim % 2 == 0
446
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
447
+ omega /= embed_dim / 2.
448
+ omega = 1. / 10000**omega
449
+
450
+ pos = pos.reshape(-1)
451
+ out = np.einsum('m,d->md', pos, omega)
452
+
453
+ emb_sin = np.sin(out)
454
+ emb_cos = np.cos(out)
455
+
456
+ emb = np.concatenate([emb_sin, emb_cos], axis=1)
457
+ return emb
458
+
459
+
460
+ #################################################################################
461
+ # Latte Configs #
462
+ #################################################################################
463
+
464
+ def Latte_XL_2(**kwargs):
465
+ return Latte(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
466
+
467
+ def Latte_XL_4(**kwargs):
468
+ return Latte(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)
469
+
470
+ def Latte_XL_8(**kwargs):
471
+ return Latte(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)
472
+
473
+ def Latte_L_2(**kwargs):
474
+ return Latte(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
475
+
476
+ def Latte_L_4(**kwargs):
477
+ return Latte(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)
478
+
479
+ def Latte_L_8(**kwargs):
480
+ return Latte(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)
481
+
482
+ def Latte_B_2(**kwargs):
483
+ return Latte(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
484
+
485
+ def Latte_B_4(**kwargs):
486
+ return Latte(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)
487
+
488
+ def Latte_B_8(**kwargs):
489
+ return Latte(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)
490
+
491
+ def Latte_S_2(**kwargs):
492
+ return Latte(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
493
+
494
+ def Latte_S_4(**kwargs):
495
+ return Latte(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)
496
+
497
+ def Latte_S_8(**kwargs):
498
+ return Latte(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)
499
+
500
+
501
+ Latte_models = {
502
+ 'Latte-XL/2': Latte_XL_2, 'Latte-XL/4': Latte_XL_4, 'Latte-XL/8': Latte_XL_8,
503
+ 'Latte-L/2': Latte_L_2, 'Latte-L/4': Latte_L_4, 'Latte-L/8': Latte_L_8,
504
+ 'Latte-B/2': Latte_B_2, 'Latte-B/4': Latte_B_4, 'Latte-B/8': Latte_B_8,
505
+ 'Latte-S/2': Latte_S_2, 'Latte-S/4': Latte_S_4, 'Latte-S/8': Latte_S_8,
506
+ }
507
+
508
+ if __name__ == '__main__':
509
+
510
+ import torch
511
+
512
+ device = "cuda" if torch.cuda.is_available() else "cpu"
513
+
514
+ img = torch.randn(3, 16, 4, 32, 32).to(device)
515
+ t = torch.tensor([1, 2, 3]).to(device)
516
+ y = torch.tensor([1, 2, 3]).to(device)
517
+ network = Latte_XL_2().to(device)
518
+ from thop import profile
519
+ flops, params = profile(network, inputs=(img, t))
520
+ print('FLOPs = ' + str(flops/1000**3) + 'G')
521
+ print('Params = ' + str(params/1000**2) + 'M')
522
+ # y_embeder = LabelEmbedder(num_classes=101, hidden_size=768, dropout_prob=0.5).to(device)
523
+ # lora.mark_only_lora_as_trainable(network)
524
+ # out = y_embeder(y, True)
525
+ # out = network(img, t, y)
526
+ # print(out.shape)
models/latte_img.py ADDED
@@ -0,0 +1,552 @@
1
+ # All rights reserved.
2
+
3
+ # This source code is licensed under the license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+ # --------------------------------------------------------
6
+ # References:
7
+ # GLIDE: https://github.com/openai/glide-text2im
8
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
9
+ # --------------------------------------------------------
10
+ import math
11
+ import torch
12
+ import torch.nn as nn
13
+ import numpy as np
14
+
15
+ from einops import rearrange, repeat
16
+ from timm.models.vision_transformer import Mlp, PatchEmbed
17
+
18
+ import os
19
+ import sys
20
+ sys.path.append(os.path.split(sys.path[0])[0])
21
+
22
+ # the xformers lib allows less memory, faster training and inference
23
+ try:
24
+ import xformers
25
+ import xformers.ops
26
+ except:
27
+ XFORMERS_IS_AVAILBLE = False
28
+
29
+ # from timm.models.layers.helpers import to_2tuple
30
+ # from timm.models.layers.trace_utils import _assert
31
+
32
+ def modulate(x, shift, scale):
33
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
34
+
35
+ #################################################################################
36
+ # Attention Layers from TIMM #
37
+ #################################################################################
38
+
39
+ class Attention(nn.Module):
40
+ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., use_lora=False, attention_mode='math'):
41
+ super().__init__()
42
+ assert dim % num_heads == 0, 'dim should be divisible by num_heads'
43
+ self.num_heads = num_heads
44
+ head_dim = dim // num_heads
45
+ self.scale = head_dim ** -0.5
46
+ self.attention_mode = attention_mode
47
+
48
+
49
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
50
+
51
+ self.attn_drop = nn.Dropout(attn_drop)
52
+ self.proj = nn.Linear(dim, dim)
53
+ self.proj_drop = nn.Dropout(proj_drop)
54
+
55
+ def forward(self, x):
56
+ B, N, C = x.shape
57
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4).contiguous()
58
+ q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple)
59
+
60
+ if self.attention_mode == 'xformers': # cause loss nan while using with amp
61
+ x = xformers.ops.memory_efficient_attention(q.transpose(1, 2).contiguous(), k.transpose(1, 2).contiguous(), v.transpose(1, 2).contiguous()).reshape(B, N, C)
62
+
63
+ elif self.attention_mode == 'flash':
64
+ # cause loss nan while using with amp
65
+ # Optionally use the context manager to ensure one of the fused kerenels is run
66
+ with torch.backends.cuda.sdp_kernel(enable_math=False):
67
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v).reshape(B, N, C) # require pytorch 2.0
68
+
69
+ elif self.attention_mode == 'math':
70
+ attn = (q @ k.transpose(-2, -1)) * self.scale
71
+ attn = attn.softmax(dim=-1)
72
+ attn = self.attn_drop(attn)
73
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
74
+
75
+ else:
76
+ raise NotImplemented
77
+
78
+ x = self.proj(x)
79
+ x = self.proj_drop(x)
80
+ return x
81
+
82
+
83
+ #################################################################################
84
+ # Embedding Layers for Timesteps and Class Labels #
85
+ #################################################################################
86
+
87
+ class TimestepEmbedder(nn.Module):
88
+ """
89
+ Embeds scalar timesteps into vector representations.
90
+ """
91
+ def __init__(self, hidden_size, frequency_embedding_size=256):
92
+ super().__init__()
93
+ self.mlp = nn.Sequential(
94
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
95
+ nn.SiLU(),
96
+ nn.Linear(hidden_size, hidden_size, bias=True),
97
+ )
98
+ self.frequency_embedding_size = frequency_embedding_size
99
+
100
+ @staticmethod
101
+ def timestep_embedding(t, dim, max_period=10000):
102
+ """
103
+ Create sinusoidal timestep embeddings.
104
+ :param t: a 1-D Tensor of N indices, one per batch element.
105
+ These be fractional.
106
+ :param dim: the dimension of the output.
107
+ :param max_period: controls the minimum frequency of the embeddings.
108
+ :return: an (N, D) Tensor of positional embeddings.
109
+ """
110
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
111
+ half = dim // 2
112
+ freqs = torch.exp(
113
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
114
+ ).to(device=t.device)
115
+ args = t[:, None].float() * freqs[None]
116
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
117
+ if dim % 2:
118
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
119
+ return embedding
120
+
121
+ def forward(self, t, use_fp16=False):
122
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
123
+ if use_fp16:
124
+ t_freq = t_freq.to(dtype=torch.float16)
125
+ t_emb = self.mlp(t_freq)
126
+ return t_emb
127
+
128
+
129
+ class LabelEmbedder(nn.Module):
130
+ """
131
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
132
+ """
133
+ def __init__(self, num_classes, hidden_size, dropout_prob):
134
+ super().__init__()
135
+ use_cfg_embedding = dropout_prob > 0
136
+ self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)
137
+ self.num_classes = num_classes
138
+ self.dropout_prob = dropout_prob
139
+
140
+ def token_drop(self, labels, force_drop_ids=None):
141
+ """
142
+ Drops labels to enable classifier-free guidance.
143
+ """
144
+ if force_drop_ids is None:
145
+ drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
146
+ else:
147
+ drop_ids = force_drop_ids == 1
148
+ labels = torch.where(drop_ids, self.num_classes, labels)
149
+ return labels
150
+
151
+ def forward(self, labels, train, force_drop_ids=None):
152
+ use_dropout = self.dropout_prob > 0
153
+ if (train and use_dropout) or (force_drop_ids is not None):
154
+ labels = self.token_drop(labels, force_drop_ids)
155
+ embeddings = self.embedding_table(labels)
156
+ return embeddings
157
+
158
+
159
+ #################################################################################
160
+ # Core Latte Model #
161
+ #################################################################################
162
+
163
+ class TransformerBlock(nn.Module):
164
+ """
165
+ A Latte block with adaptive layer norm zero (adaLN-Zero) conditioning.
166
+ """
167
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
168
+ super().__init__()
169
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
170
+ self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
171
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
172
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
173
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
174
+ self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
175
+ self.adaLN_modulation = nn.Sequential(
176
+ nn.SiLU(),
177
+ nn.Linear(hidden_size, 6 * hidden_size, bias=True)
178
+ )
179
+
180
+ def forward(self, x, c):
181
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)
182
+ x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
183
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
184
+ return x
185
+
186
+
187
+ class FinalLayer(nn.Module):
188
+ """
189
+ The final layer of Latte.
190
+ """
191
+ def __init__(self, hidden_size, patch_size, out_channels):
192
+ super().__init__()
193
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
194
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
195
+ self.adaLN_modulation = nn.Sequential(
196
+ nn.SiLU(),
197
+ nn.Linear(hidden_size, 2 * hidden_size, bias=True)
198
+ )
199
+
200
+ def forward(self, x, c):
201
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
202
+ x = modulate(self.norm_final(x), shift, scale)
203
+ x = self.linear(x)
204
+ return x
205
+
206
+
207
+ class Latte(nn.Module):
208
+ """
209
+ Diffusion model with a Transformer backbone.
210
+ """
211
+ def __init__(
212
+ self,
213
+ input_size=32,
214
+ patch_size=2,
215
+ in_channels=4,
216
+ hidden_size=1152,
217
+ depth=28,
218
+ num_heads=16,
219
+ mlp_ratio=4.0,
220
+ num_frames=16,
221
+ class_dropout_prob=0.1,
222
+ num_classes=1000,
223
+ learn_sigma=True,
224
+ extras=2,
225
+ attention_mode='math',
226
+ ):
227
+ super().__init__()
228
+ self.learn_sigma = learn_sigma
229
+ self.in_channels = in_channels
230
+ self.out_channels = in_channels * 2 if learn_sigma else in_channels
231
+ self.patch_size = patch_size
232
+ self.num_heads = num_heads
233
+ self.extras = extras
234
+ self.num_frames = num_frames
235
+
236
+ self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
237
+ self.t_embedder = TimestepEmbedder(hidden_size)
238
+
239
+ if self.extras == 2:
240
+ self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)
241
+ if self.extras == 78: # timestep + text_embedding
242
+ self.text_embedding_projection = nn.Sequential(
243
+ nn.SiLU(),
244
+ nn.Linear(1024, hidden_size, bias=True)
245
+ )
246
+
247
+ num_patches = self.x_embedder.num_patches
248
+ # Will use fixed sin-cos embedding:
249
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)
250
+ self.temp_embed = nn.Parameter(torch.zeros(1, num_frames, hidden_size), requires_grad=False)
251
+
252
+ self.blocks = nn.ModuleList([
253
+ TransformerBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio, attention_mode=attention_mode) for _ in range(depth)
254
+ ])
255
+
256
+ self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
257
+ self.initialize_weights()
258
+
259
+ def initialize_weights(self):
260
+ # Initialize transformer layers:
261
+ def _basic_init(module):
262
+ if isinstance(module, nn.Linear):
263
+ torch.nn.init.xavier_uniform_(module.weight)
264
+ if module.bias is not None:
265
+ nn.init.constant_(module.bias, 0)
266
+ self.apply(_basic_init)
267
+
268
+ # Initialize (and freeze) pos_embed by sin-cos embedding:
269
+ pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches ** 0.5))
270
+ self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))
271
+
272
+ temp_embed = get_1d_sincos_temp_embed(self.temp_embed.shape[-1], self.temp_embed.shape[-2])
273
+ self.temp_embed.data.copy_(torch.from_numpy(temp_embed).float().unsqueeze(0))
274
+
275
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
276
+ w = self.x_embedder.proj.weight.data
277
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
278
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
279
+
280
+ if self.extras == 2:
281
+ # Initialize label embedding table:
282
+ nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)
283
+
284
+ # Initialize timestep embedding MLP:
285
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
286
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
287
+
288
+ # Zero-out adaLN modulation layers in Latte blocks:
289
+ for block in self.blocks:
290
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
291
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
292
+
293
+ # Zero-out output layers:
294
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
295
+ nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
296
+ nn.init.constant_(self.final_layer.linear.weight, 0)
297
+ nn.init.constant_(self.final_layer.linear.bias, 0)
298
+
299
+ def unpatchify(self, x):
300
+ """
301
+ x: (N, T, patch_size**2 * C)
302
+ imgs: (N, C, H, W)
303
+ """
304
+ c = self.out_channels
305
+ p = self.x_embedder.patch_size[0]
306
+ h = w = int(x.shape[1] ** 0.5)
307
+ assert h * w == x.shape[1]
308
+
309
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
310
+ x = torch.einsum('nhwpqc->nchpwq', x)
311
+ imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
312
+ return imgs
313
+
314
+ # @torch.cuda.amp.autocast()
315
+ # @torch.compile
316
+ def forward(self, x, t, y=None, text_embedding=None, use_fp16=False, y_image=None, use_image_num=0):
317
+ """
318
+ Forward pass of Latte.
319
+ x: (N, F, C, H, W) tensor of video inputs
320
+ t: (N,) tensor of diffusion timesteps
321
+ y: (N,) tensor of class labels
322
+ y_image: list of class-label tensors for the extra images used in image-video joint training
323
+ use_image_num: number of extra image frames appended after the video frames
324
+ """
325
+ if use_fp16:
326
+ x = x.to(dtype=torch.float16)
327
+ batches, frames, channels, height, width = x.shape
328
+ x = rearrange(x, 'b f c h w -> (b f) c h w')
329
+ x = self.x_embedder(x) + self.pos_embed
330
+ t = self.t_embedder(t, use_fp16=use_fp16)
331
+ timestep_spatial = repeat(t, 'n d -> (n c) d', c=self.temp_embed.shape[1] + use_image_num)
332
+ timestep_temp = repeat(t, 'n d -> (n c) d', c=self.pos_embed.shape[1])
333
+
334
+ if self.extras == 2:
335
+ y = self.y_embedder(y, self.training)
336
+ if self.training:
337
+ y_image_emb = []
338
+ # print(y_image)
339
+ for y_image_single in y_image:
340
+ # print(y_image_single)
341
+ y_image_single = y_image_single.reshape(1, -1)
342
+ y_image_emb.append(self.y_embedder(y_image_single, self.training))
343
+ y_image_emb = torch.cat(y_image_emb, dim=0)
344
+ y_spatial = repeat(y, 'n d -> n c d', c=self.temp_embed.shape[1])
345
+ y_spatial = torch.cat([y_spatial, y_image_emb], dim=1)
346
+ y_spatial = rearrange(y_spatial, 'n c d -> (n c) d')
347
+ else:
348
+ y_spatial = repeat(y, 'n d -> (n c) d', c=self.temp_embed.shape[1])
349
+
350
+ y_temp = repeat(y, 'n d -> (n c) d', c=self.pos_embed.shape[1])
351
+ elif self.extras == 78:
352
+ text_embedding = self.text_embedding_projection(text_embedding)
353
+ text_embedding_video = text_embedding[:, :1, :]
354
+ text_embedding_image = text_embedding[:, 1:, :]
355
+ text_embedding_temp = repeat(text_embedding_video, 'n t d -> n (t c) d', c=self.pos_embed.shape[1])
356
+ text_embedding_temp = rearrange(text_embedding_temp, 'n t d -> (n t) d')
357
+ text_embedding_video = repeat(text_embedding_video, 'n t d -> n (t c) d', c=self.temp_embed.shape[1])
358
+ text_embedding_spatial = torch.cat([text_embedding_video, text_embedding_image], dim=1)
359
+ text_embedding_spatial = rearrange(text_embedding_spatial, 'n t d -> (n t) d')
360
+
361
+ for i in range(0, len(self.blocks), 2):
362
+ spatial_block, temp_block = self.blocks[i:i+2]
363
+
364
+ if self.extras == 2:
365
+ c = timestep_spatial + y_spatial
366
+ elif self.extras == 78:
367
+ c = timestep_spatial + text_embedding_spatial
368
+ else:
369
+ c = timestep_spatial
370
+ x = spatial_block(x, c)
371
+
372
+ x = rearrange(x, '(b f) t d -> (b t) f d', b=batches)
373
+ x_video = x[:, :(frames-use_image_num), :]
374
+ x_image = x[:, (frames-use_image_num):, :]
375
+
376
+ # Add Time Embedding
377
+ if i == 0:
378
+ x_video = x_video + self.temp_embed
379
+
380
+ if self.extras == 2:
381
+ c = timestep_temp + y_temp
382
+ elif self.extras == 78:
383
+ c = timestep_temp + text_embedding_temp
384
+ else:
385
+ c = timestep_temp
386
+
387
+ x_video = temp_block(x_video, c)
388
+ x = torch.cat([x_video, x_image], dim=1)
389
+ x = rearrange(x, '(b t) f d -> (b f) t d', b=batches)
390
+
391
+ if self.extras == 2:
392
+ c = timestep_spatial + y_spatial
393
+ else:
394
+ c = timestep_spatial
395
+ x = self.final_layer(x, c)
396
+ x = self.unpatchify(x)
397
+ x = rearrange(x, '(b f) c h w -> b f c h w', b=batches)
398
+ # print(x.shape)
399
+ return x
400
+
401
+
402
+ def forward_with_cfg(self, x, t, y, cfg_scale, use_fp16=False):
403
+ """
404
+ Forward pass of Latte, but also batches the unconditional forward pass for classifier-free guidance.
405
+ """
406
+ # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
407
+ half = x[: len(x) // 2]
408
+ combined = torch.cat([half, half], dim=0)
409
+ if use_fp16:
410
+ combined = combined.to(dtype=torch.float16)
411
+ model_out = self.forward(combined, t, y, use_fp16=use_fp16)
412
+ # For exact reproducibility reasons, we apply classifier-free guidance only to the first
413
+ # four (latent) channels by default. The standard approach to cfg applies it to all channels.
414
+ # This can be done by uncommenting the following line and commenting-out the line following that.
415
+ # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
416
+ # eps, rest = model_out[:, :3], model_out[:, 3:]
417
+ eps, rest = model_out[:, :, :4, ...], model_out[:, :, 4:, ...] # split the guided latent channels from the rest, e.g. eps: (2, 16, 4, 32, 32)
418
+ cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
419
+ half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
420
+ eps = torch.cat([half_eps, half_eps], dim=0)
421
+ return torch.cat([eps, rest], dim=2)
422
+
423
+
424
+ #################################################################################
425
+ # Sine/Cosine Positional Embedding Functions #
426
+ #################################################################################
427
+ # https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
428
+
429
+ def get_1d_sincos_temp_embed(embed_dim, length):
430
+ pos = torch.arange(0, length).unsqueeze(1)
431
+ return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
432
+
433
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
434
+ """
435
+ grid_size: int of the grid height and width
436
+ return:
437
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
438
+ """
439
+ grid_h = np.arange(grid_size, dtype=np.float32)
440
+ grid_w = np.arange(grid_size, dtype=np.float32)
441
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
442
+ grid = np.stack(grid, axis=0)
443
+
444
+ grid = grid.reshape([2, 1, grid_size, grid_size])
445
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
446
+ if cls_token and extra_tokens > 0:
447
+ pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
448
+ return pos_embed
449
+
450
+
451
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
452
+ assert embed_dim % 2 == 0
453
+
454
+ # use half of dimensions to encode grid_h
455
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
456
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
457
+
458
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
459
+ return emb
460
+
461
+
462
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
463
+ """
464
+ embed_dim: output dimension for each position
465
+ pos: a list of positions to be encoded: size (M,)
466
+ out: (M, D)
467
+ """
468
+ assert embed_dim % 2 == 0
469
+ omega = np.arange(embed_dim // 2, dtype=np.float64)
470
+ omega /= embed_dim / 2.
471
+ omega = 1. / 10000**omega # (D/2,)
472
+
473
+ pos = pos.reshape(-1) # (M,)
474
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
475
+
476
+ emb_sin = np.sin(out) # (M, D/2)
477
+ emb_cos = np.cos(out) # (M, D/2)
478
+
479
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
480
+ return emb
481
+
482
+
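+ # A quick shape note for the helpers above (illustrative, not part of the original file):
+ # with hidden_size=1152, a 16x16 patch grid and 16 frames,
+ # get_2d_sincos_pos_embed(1152, 16).shape -> (256, 1152) # one row per spatial patch
+ # get_1d_sincos_temp_embed(1152, 16).shape -> (16, 1152) # one row per frame
+ # Both tables are copied into frozen (requires_grad=False) parameters in initialize_weights(),
+ # so they stay fixed for the whole of training.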
483
+ #################################################################################
484
+ # Latte Configs #
485
+ #################################################################################
486
+
487
+ def Latte_XL_2(**kwargs):
488
+ return Latte(depth=28, hidden_size=1152, patch_size=2, num_heads=16, **kwargs)
489
+
490
+ def Latte_XL_4(**kwargs):
491
+ return Latte(depth=28, hidden_size=1152, patch_size=4, num_heads=16, **kwargs)
492
+
493
+ def Latte_XL_8(**kwargs):
494
+ return Latte(depth=28, hidden_size=1152, patch_size=8, num_heads=16, **kwargs)
495
+
496
+ def Latte_L_2(**kwargs):
497
+ return Latte(depth=24, hidden_size=1024, patch_size=2, num_heads=16, **kwargs)
498
+
499
+ def Latte_L_4(**kwargs):
500
+ return Latte(depth=24, hidden_size=1024, patch_size=4, num_heads=16, **kwargs)
501
+
502
+ def Latte_L_8(**kwargs):
503
+ return Latte(depth=24, hidden_size=1024, patch_size=8, num_heads=16, **kwargs)
504
+
505
+ def Latte_B_2(**kwargs):
506
+ return Latte(depth=12, hidden_size=768, patch_size=2, num_heads=12, **kwargs)
507
+
508
+ def Latte_B_4(**kwargs):
509
+ return Latte(depth=12, hidden_size=768, patch_size=4, num_heads=12, **kwargs)
510
+
511
+ def Latte_B_8(**kwargs):
512
+ return Latte(depth=12, hidden_size=768, patch_size=8, num_heads=12, **kwargs)
513
+
514
+ def Latte_S_2(**kwargs):
515
+ return Latte(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)
516
+
517
+ def Latte_S_4(**kwargs):
518
+ return Latte(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)
519
+
520
+ def Latte_S_8(**kwargs):
521
+ return Latte(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)
522
+
523
+
524
+ LatteIMG_models = {
525
+ 'LatteIMG-XL/2': Latte_XL_2, 'LatteIMG-XL/4': Latte_XL_4, 'LatteIMG-XL/8': Latte_XL_8,
526
+ 'LatteIMG-L/2': Latte_L_2, 'LatteIMG-L/4': Latte_L_4, 'LatteIMG-L/8': Latte_L_8,
527
+ 'LatteIMG-B/2': Latte_B_2, 'LatteIMG-B/4': Latte_B_4, 'LatteIMG-B/8': Latte_B_8,
528
+ 'LatteIMG-S/2': Latte_S_2, 'LatteIMG-S/4': Latte_S_4, 'LatteIMG-S/8': Latte_S_8,
529
+ }
530
+
531
+ if __name__ == '__main__':
532
+ import torch
533
+
534
+ device = "cuda" if torch.cuda.is_available() else "cpu"
535
+
536
+ use_image_num = 8
537
+
538
+ img = torch.randn(3, 16+use_image_num, 4, 32, 32).to(device)
539
+
540
+ t = torch.tensor([1, 2, 3]).to(device)
541
+ y = torch.tensor([1, 2, 3]).to(device)
542
+ y_image = [torch.tensor([48, 37, 72, 63, 74, 6, 7, 8]).to(device),
543
+ torch.tensor([37, 72, 63, 74, 70, 1, 2, 3]).to(device),
544
+ torch.tensor([72, 63, 74, 70, 71, 5, 8, 7]).to(device),
545
+ ]
546
+
547
+
548
+ network = Latte_XL_2().to(device)
549
+ network.train()
550
+
551
+ out = network(img, t, y=y, y_image=y_image, use_image_num=use_image_num)
552
+ print(out.shape)
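+
+ # Illustrative classifier-free-guidance call (not part of the original file). It assumes the
+ # DiT-style LabelEmbedder, where index num_classes (1000 here) is the learned null/unconditional
+ # embedding; forward_with_cfg reuses the first half of the batch and overwrites the second half.
+ network.eval()
+ with torch.no_grad():
+ z = torch.randn(1, 16, 4, 32, 32).to(device)
+ z = torch.cat([z, z], dim=0)
+ y_cfg = torch.tensor([207, 1000]).to(device) # [conditional label, null class]
+ t_cfg = torch.tensor([999, 999]).to(device)
+ out_cfg = network.forward_with_cfg(z, t_cfg, y_cfg, cfg_scale=4.0)
+ print(out_cfg.shape) # torch.Size([2, 16, 8, 32, 32]) since learn_sigma=True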
models/latte_t2v.py ADDED
@@ -0,0 +1,945 @@
1
+ import torch
2
+
3
+ import os
4
+ import json
5
+
6
+ from dataclasses import dataclass
7
+ from einops import rearrange, repeat
8
+ from typing import Any, Dict, Optional, Tuple
9
+ from diffusers.models import Transformer2DModel
10
+ from diffusers.utils import USE_PEFT_BACKEND, BaseOutput, deprecate
11
+ from diffusers.models.embeddings import get_1d_sincos_pos_embed_from_grid, ImagePositionalEmbeddings, CaptionProjection, PatchEmbed, CombinedTimestepSizeEmbeddings
12
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+ from diffusers.models.attention import BasicTransformerBlock
15
+ from diffusers.models.lora import LoRACompatibleConv, LoRACompatibleLinear
16
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
17
+ from diffusers.models.embeddings import SinusoidalPositionalEmbedding
18
+ from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero
19
+ from diffusers.models.attention_processor import Attention
20
+ from diffusers.models.activations import GEGLU, GELU, ApproximateGELU
21
+
22
+ from dataclasses import dataclass
23
+
24
+ import torch
25
+ import torch.nn.functional as F
26
+ from torch import nn
27
+
28
+ @maybe_allow_in_graph
29
+ class GatedSelfAttentionDense(nn.Module):
30
+ r"""
31
+ A gated self-attention dense layer that combines visual features and object features.
32
+
33
+ Parameters:
34
+ query_dim (`int`): The number of channels in the query.
35
+ context_dim (`int`): The number of channels in the context.
36
+ n_heads (`int`): The number of heads to use for attention.
37
+ d_head (`int`): The number of channels in each head.
38
+ """
39
+
40
+ def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
41
+ super().__init__()
42
+
43
+ # we need a linear projection since we concatenate the visual features and the object features
44
+ self.linear = nn.Linear(context_dim, query_dim)
45
+
46
+ self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
47
+ self.ff = FeedForward(query_dim, activation_fn="geglu")
48
+
49
+ self.norm1 = nn.LayerNorm(query_dim)
50
+ self.norm2 = nn.LayerNorm(query_dim)
51
+
52
+ self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0)))
53
+ self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0)))
54
+
55
+ self.enabled = True
56
+
57
+ def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor:
58
+ if not self.enabled:
59
+ return x
60
+
61
+ n_visual = x.shape[1]
62
+ objs = self.linear(objs)
63
+
64
+ x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :]
65
+ x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
66
+
67
+ return x
68
+
69
+ class FeedForward(nn.Module):
70
+ r"""
71
+ A feed-forward layer.
72
+
73
+ Parameters:
74
+ dim (`int`): The number of channels in the input.
75
+ dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`.
76
+ mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension.
77
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
78
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
79
+ final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
80
+ """
81
+
82
+ def __init__(
83
+ self,
84
+ dim: int,
85
+ dim_out: Optional[int] = None,
86
+ mult: int = 4,
87
+ dropout: float = 0.0,
88
+ activation_fn: str = "geglu",
89
+ final_dropout: bool = False,
90
+ ):
91
+ super().__init__()
92
+ inner_dim = int(dim * mult)
93
+ dim_out = dim_out if dim_out is not None else dim
94
+ linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
95
+
96
+ if activation_fn == "gelu":
97
+ act_fn = GELU(dim, inner_dim)
98
+ if activation_fn == "gelu-approximate":
99
+ act_fn = GELU(dim, inner_dim, approximate="tanh")
100
+ elif activation_fn == "geglu":
101
+ act_fn = GEGLU(dim, inner_dim)
102
+ elif activation_fn == "geglu-approximate":
103
+ act_fn = ApproximateGELU(dim, inner_dim)
104
+
105
+ self.net = nn.ModuleList([])
106
+ # project in
107
+ self.net.append(act_fn)
108
+ # project dropout
109
+ self.net.append(nn.Dropout(dropout))
110
+ # project out
111
+ self.net.append(linear_cls(inner_dim, dim_out))
112
+ # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
113
+ if final_dropout:
114
+ self.net.append(nn.Dropout(dropout))
115
+
116
+ def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
117
+ compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear)
118
+ for module in self.net:
119
+ if isinstance(module, compatible_cls):
120
+ hidden_states = module(hidden_states, scale)
121
+ else:
122
+ hidden_states = module(hidden_states)
123
+ return hidden_states
124
+
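+ # Illustrative usage (not part of the original file): with the default "geglu" activation the
+ # hidden width is dim * mult, and the output keeps the input shape:
+ # ff = FeedForward(128) # GEGLU(128 -> 512), Dropout, Linear(512 -> 128)
+ # ff(torch.randn(2, 10, 128)).shape -> torch.Size([2, 10, 128])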
125
+ @maybe_allow_in_graph
126
+ class BasicTransformerBlock_(nn.Module):
127
+ r"""
128
+ A basic Transformer block.
129
+
130
+ Parameters:
131
+ dim (`int`): The number of channels in the input and output.
132
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
133
+ attention_head_dim (`int`): The number of channels in each head.
134
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
135
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
136
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
137
+ num_embeds_ada_norm (`int`, *optional*):
138
+ The number of diffusion steps used during training. See `Transformer2DModel`.
139
+ attention_bias (`bool`, *optional*, defaults to `False`):
140
+ Configure if the attentions should contain a bias parameter.
141
+ only_cross_attention (`bool`, *optional*):
142
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
143
+ double_self_attention (`bool`, *optional*):
144
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
145
+ upcast_attention (`bool`, *optional*):
146
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
147
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
148
+ Whether to use learnable elementwise affine parameters for normalization.
149
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
150
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
151
+ final_dropout (`bool` *optional*, defaults to False):
152
+ Whether to apply a final dropout after the last feed-forward layer.
153
+ attention_type (`str`, *optional*, defaults to `"default"`):
154
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
155
+ positional_embeddings (`str`, *optional*, defaults to `None`):
156
+ The type of positional embeddings to apply to.
157
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
158
+ The maximum number of positional embeddings to apply.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ dim: int,
164
+ num_attention_heads: int,
165
+ attention_head_dim: int,
166
+ dropout=0.0,
167
+ cross_attention_dim: Optional[int] = None,
168
+ activation_fn: str = "geglu",
169
+ num_embeds_ada_norm: Optional[int] = None,
170
+ attention_bias: bool = False,
171
+ only_cross_attention: bool = False,
172
+ double_self_attention: bool = False,
173
+ upcast_attention: bool = False,
174
+ norm_elementwise_affine: bool = True,
175
+ norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
176
+ norm_eps: float = 1e-5,
177
+ final_dropout: bool = False,
178
+ attention_type: str = "default",
179
+ positional_embeddings: Optional[str] = None,
180
+ num_positional_embeddings: Optional[int] = None,
181
+ ):
182
+ super().__init__()
183
+ self.only_cross_attention = only_cross_attention
184
+
185
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
186
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
187
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
188
+ self.use_layer_norm = norm_type == "layer_norm"
189
+
190
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
191
+ raise ValueError(
192
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
193
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
194
+ )
195
+
196
+ if positional_embeddings and (num_positional_embeddings is None):
197
+ raise ValueError(
198
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
199
+ )
200
+
201
+ if positional_embeddings == "sinusoidal":
202
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
203
+ else:
204
+ self.pos_embed = None
205
+
206
+ # Define 3 blocks. Each block has its own normalization layer.
207
+ # 1. Self-Attn
208
+ if self.use_ada_layer_norm:
209
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
210
+ elif self.use_ada_layer_norm_zero:
211
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
212
+ else:
213
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) # go here
214
+
215
+ self.attn1 = Attention(
216
+ query_dim=dim,
217
+ heads=num_attention_heads,
218
+ dim_head=attention_head_dim,
219
+ dropout=dropout,
220
+ bias=attention_bias,
221
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
222
+ upcast_attention=upcast_attention,
223
+ )
224
+
225
+ # # 2. Cross-Attn
226
+ # if cross_attention_dim is not None or double_self_attention:
227
+ # # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
228
+ # # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
229
+ # # the second cross attention block.
230
+ # self.norm2 = (
231
+ # AdaLayerNorm(dim, num_embeds_ada_norm)
232
+ # if self.use_ada_layer_norm
233
+ # else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
234
+ # )
235
+ # self.attn2 = Attention(
236
+ # query_dim=dim,
237
+ # cross_attention_dim=cross_attention_dim if not double_self_attention else None,
238
+ # heads=num_attention_heads,
239
+ # dim_head=attention_head_dim,
240
+ # dropout=dropout,
241
+ # bias=attention_bias,
242
+ # upcast_attention=upcast_attention,
243
+ # ) # is self-attn if encoder_hidden_states is none
244
+ # else:
245
+ # self.norm2 = None
246
+ # self.attn2 = None
247
+
248
+ # 3. Feed-forward
249
+ # if not self.use_ada_layer_norm_single:
250
+ # self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
251
+ self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
252
+
253
+ self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
254
+
255
+ # 4. Fuser
256
+ if attention_type == "gated" or attention_type == "gated-text-image":
257
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
258
+
259
+ # 5. Scale-shift for PixArt-Alpha.
260
+ if self.use_ada_layer_norm_single:
261
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
262
+
263
+ # let chunk size default to None
264
+ self._chunk_size = None
265
+ self._chunk_dim = 0
266
+
267
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
268
+ # Sets chunk feed-forward
269
+ self._chunk_size = chunk_size
270
+ self._chunk_dim = dim
271
+
272
+ def forward(
273
+ self,
274
+ hidden_states: torch.FloatTensor,
275
+ attention_mask: Optional[torch.FloatTensor] = None,
276
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
277
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
278
+ timestep: Optional[torch.LongTensor] = None,
279
+ cross_attention_kwargs: Dict[str, Any] = None,
280
+ class_labels: Optional[torch.LongTensor] = None,
281
+ ) -> torch.FloatTensor:
282
+ # Notice that normalization is always applied before the real computation in the following blocks.
283
+ # 0. Self-Attention
284
+ batch_size = hidden_states.shape[0]
285
+
286
+ if self.use_ada_layer_norm:
287
+ norm_hidden_states = self.norm1(hidden_states, timestep)
288
+ elif self.use_ada_layer_norm_zero:
289
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
290
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
291
+ )
292
+ elif self.use_layer_norm:
293
+ norm_hidden_states = self.norm1(hidden_states)
294
+ elif self.use_ada_layer_norm_single: # go here
295
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
296
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
297
+ ).chunk(6, dim=1)
298
+ norm_hidden_states = self.norm1(hidden_states)
299
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
300
+ # norm_hidden_states = norm_hidden_states.squeeze(1)
301
+ else:
302
+ raise ValueError("Incorrect norm used")
303
+
304
+ if self.pos_embed is not None:
305
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
306
+
307
+ # 1. Retrieve lora scale.
308
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
309
+
310
+ # 2. Prepare GLIGEN inputs
311
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
312
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
313
+
314
+ attn_output = self.attn1(
315
+ norm_hidden_states,
316
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
317
+ attention_mask=attention_mask,
318
+ **cross_attention_kwargs,
319
+ )
320
+ if self.use_ada_layer_norm_zero:
321
+ attn_output = gate_msa.unsqueeze(1) * attn_output
322
+ elif self.use_ada_layer_norm_single:
323
+ attn_output = gate_msa * attn_output
324
+
325
+ hidden_states = attn_output + hidden_states
326
+ if hidden_states.ndim == 4:
327
+ hidden_states = hidden_states.squeeze(1)
328
+
329
+ # 2.5 GLIGEN Control
330
+ if gligen_kwargs is not None:
331
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
332
+
333
+ # # 3. Cross-Attention
334
+ # if self.attn2 is not None:
335
+ # if self.use_ada_layer_norm:
336
+ # norm_hidden_states = self.norm2(hidden_states, timestep)
337
+ # elif self.use_ada_layer_norm_zero or self.use_layer_norm:
338
+ # norm_hidden_states = self.norm2(hidden_states)
339
+ # elif self.use_ada_layer_norm_single:
340
+ # # For PixArt norm2 isn't applied here:
341
+ # # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
342
+ # norm_hidden_states = hidden_states
343
+ # else:
344
+ # raise ValueError("Incorrect norm")
345
+
346
+ # if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
347
+ # norm_hidden_states = self.pos_embed(norm_hidden_states)
348
+
349
+ # attn_output = self.attn2(
350
+ # norm_hidden_states,
351
+ # encoder_hidden_states=encoder_hidden_states,
352
+ # attention_mask=encoder_attention_mask,
353
+ # **cross_attention_kwargs,
354
+ # )
355
+ # hidden_states = attn_output + hidden_states
356
+
357
+ # 4. Feed-forward
358
+ # if not self.use_ada_layer_norm_single:
359
+ # norm_hidden_states = self.norm3(hidden_states)
360
+
361
+ if self.use_ada_layer_norm_zero:
362
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
363
+
364
+ if self.use_ada_layer_norm_single:
365
+ # norm_hidden_states = self.norm2(hidden_states)
366
+ norm_hidden_states = self.norm3(hidden_states)
367
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
368
+
369
+ if self._chunk_size is not None:
370
+ # "feed_forward_chunk_size" can be used to save memory
371
+ if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
372
+ raise ValueError(
373
+ f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
374
+ )
375
+
376
+ num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
377
+ ff_output = torch.cat(
378
+ [
379
+ self.ff(hid_slice, scale=lora_scale)
380
+ for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)
381
+ ],
382
+ dim=self._chunk_dim,
383
+ )
384
+ else:
385
+ ff_output = self.ff(norm_hidden_states, scale=lora_scale)
386
+
387
+ if self.use_ada_layer_norm_zero:
388
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
389
+ elif self.use_ada_layer_norm_single:
390
+ ff_output = gate_mlp * ff_output
391
+
392
+ hidden_states = ff_output + hidden_states
393
+ if hidden_states.ndim == 4:
394
+ hidden_states = hidden_states.squeeze(1)
395
+
396
+ return hidden_states
397
+
398
+ class AdaLayerNormSingle(nn.Module):
399
+ r"""
400
+ Norm layer adaptive layer norm single (adaLN-single).
401
+
402
+ As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
403
+
404
+ Parameters:
405
+ embedding_dim (`int`): The size of each embedding vector.
406
+ use_additional_conditions (`bool`): To use additional conditions for normalization or not.
407
+ """
408
+
409
+ def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
410
+ super().__init__()
411
+
412
+ self.emb = CombinedTimestepSizeEmbeddings(
413
+ embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
414
+ )
415
+
416
+ self.silu = nn.SiLU()
417
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
418
+
419
+ def forward(
420
+ self,
421
+ timestep: torch.Tensor,
422
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
423
+ batch_size: int = None,
424
+ hidden_dtype: Optional[torch.dtype] = None,
425
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
426
+ # No modulation happening here.
427
+ embedded_timestep = self.emb(timestep, batch_size=batch_size, hidden_dtype=hidden_dtype, resolution=None, aspect_ratio=None)
428
+ return self.linear(self.silu(embedded_timestep)), embedded_timestep
429
+
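+ # Shape sketch (illustrative, not part of the original file): for embedding_dim=1152,
+ # mod, emb_t = AdaLayerNormSingle(1152)(torch.tensor([999]), batch_size=1, hidden_dtype=torch.float32)
+ # mod.shape -> (1, 6 * 1152) # split into six modulation chunks inside the transformer blocks
+ # emb_t.shape -> (1, 1152) # reused by LatteT2V for the final scale/shift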
430
+ @dataclass
431
+ class Transformer3DModelOutput(BaseOutput):
432
+ """
433
+ The output of [`Transformer2DModel`].
434
+
435
+ Args:
436
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete):
437
+ The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability
438
+ distributions for the unnoised latent pixels.
439
+ """
440
+
441
+ sample: torch.FloatTensor
442
+
443
+
444
+ class LatteT2V(ModelMixin, ConfigMixin):
445
+ _supports_gradient_checkpointing = True
446
+
447
+ """
448
+ A spatio-temporal Transformer model for video-like data, extending diffusers' Transformer2DModel interface with temporal attention blocks.
449
+
450
+ Parameters:
451
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
452
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
453
+ in_channels (`int`, *optional*):
454
+ The number of channels in the input and output (specify if the input is **continuous**).
455
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
456
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
457
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
458
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
459
+ This is fixed during training since it is used to learn a number of position embeddings.
460
+ num_vector_embeds (`int`, *optional*):
461
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
462
+ Includes the class for the masked latent pixel.
463
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
464
+ num_embeds_ada_norm ( `int`, *optional*):
465
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
466
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
467
+ added to the hidden states.
468
+
469
+ During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
470
+ attention_bias (`bool`, *optional*):
471
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
472
+ """
473
+
474
+ @register_to_config
475
+ def __init__(
476
+ self,
477
+ num_attention_heads: int = 16,
478
+ attention_head_dim: int = 88,
479
+ in_channels: Optional[int] = None,
480
+ out_channels: Optional[int] = None,
481
+ num_layers: int = 1,
482
+ dropout: float = 0.0,
483
+ norm_num_groups: int = 32,
484
+ cross_attention_dim: Optional[int] = None,
485
+ attention_bias: bool = False,
486
+ sample_size: Optional[int] = None,
487
+ num_vector_embeds: Optional[int] = None,
488
+ patch_size: Optional[int] = None,
489
+ activation_fn: str = "geglu",
490
+ num_embeds_ada_norm: Optional[int] = None,
491
+ use_linear_projection: bool = False,
492
+ only_cross_attention: bool = False,
493
+ double_self_attention: bool = False,
494
+ upcast_attention: bool = False,
495
+ norm_type: str = "layer_norm",
496
+ norm_elementwise_affine: bool = True,
497
+ norm_eps: float = 1e-5,
498
+ attention_type: str = "default",
499
+ caption_channels: int = None,
500
+ video_length: int = 16,
501
+ ):
502
+ super().__init__()
503
+ self.use_linear_projection = use_linear_projection
504
+ self.num_attention_heads = num_attention_heads
505
+ self.attention_head_dim = attention_head_dim
506
+ inner_dim = num_attention_heads * attention_head_dim
507
+ self.video_length = video_length
508
+
509
+ conv_cls = nn.Conv2d if USE_PEFT_BACKEND else LoRACompatibleConv
510
+ linear_cls = nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
511
+
512
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
513
+ # Define whether input is continuous or discrete depending on configuration
514
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
515
+ self.is_input_vectorized = num_vector_embeds is not None
516
+ self.is_input_patches = in_channels is not None and patch_size is not None
517
+
518
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
519
+ deprecation_message = (
520
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
521
+ " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
522
+ " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
523
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
524
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
525
+ )
526
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
527
+ norm_type = "ada_norm"
528
+
529
+ if self.is_input_continuous and self.is_input_vectorized:
530
+ raise ValueError(
531
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
532
+ " sure that either `in_channels` or `num_vector_embeds` is None."
533
+ )
534
+ elif self.is_input_vectorized and self.is_input_patches:
535
+ raise ValueError(
536
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
537
+ " sure that either `num_vector_embeds` or `num_patches` is None."
538
+ )
539
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
540
+ raise ValueError(
541
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
542
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
543
+ )
544
+
545
+ # 2. Define input layers
546
+ if self.is_input_continuous:
547
+ self.in_channels = in_channels
548
+
549
+ self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
550
+ if use_linear_projection:
551
+ self.proj_in = linear_cls(in_channels, inner_dim)
552
+ else:
553
+ self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
554
+ elif self.is_input_vectorized:
555
+ assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
556
+ assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"
557
+
558
+ self.height = sample_size
559
+ self.width = sample_size
560
+ self.num_vector_embeds = num_vector_embeds
561
+ self.num_latent_pixels = self.height * self.width
562
+
563
+ self.latent_image_embedding = ImagePositionalEmbeddings(
564
+ num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
565
+ )
566
+ elif self.is_input_patches:
567
+ assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
568
+
569
+ self.height = sample_size
570
+ self.width = sample_size
571
+
572
+ self.patch_size = patch_size
573
+ interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1
574
+ interpolation_scale = max(interpolation_scale, 1)
575
+ self.pos_embed = PatchEmbed(
576
+ height=sample_size,
577
+ width=sample_size,
578
+ patch_size=patch_size,
579
+ in_channels=in_channels,
580
+ embed_dim=inner_dim,
581
+ interpolation_scale=interpolation_scale,
582
+ )
583
+
584
+ # 3. Define transformers blocks
585
+ self.transformer_blocks = nn.ModuleList(
586
+ [
587
+ BasicTransformerBlock(
588
+ inner_dim,
589
+ num_attention_heads,
590
+ attention_head_dim,
591
+ dropout=dropout,
592
+ cross_attention_dim=cross_attention_dim,
593
+ activation_fn=activation_fn,
594
+ num_embeds_ada_norm=num_embeds_ada_norm,
595
+ attention_bias=attention_bias,
596
+ only_cross_attention=only_cross_attention,
597
+ double_self_attention=double_self_attention,
598
+ upcast_attention=upcast_attention,
599
+ norm_type=norm_type,
600
+ norm_elementwise_affine=norm_elementwise_affine,
601
+ norm_eps=norm_eps,
602
+ attention_type=attention_type,
603
+ )
604
+ for d in range(num_layers)
605
+ ]
606
+ )
607
+
608
+ # Define temporal transformers blocks
609
+ self.temporal_transformer_blocks = nn.ModuleList(
610
+ [
611
+ BasicTransformerBlock_( # temporal block: a single self-attention, no cross-attention
612
+ inner_dim,
613
+ num_attention_heads, # num_attention_heads
614
+ attention_head_dim, # attention_head_dim 72
615
+ dropout=dropout,
616
+ cross_attention_dim=None,
617
+ activation_fn=activation_fn,
618
+ num_embeds_ada_norm=num_embeds_ada_norm,
619
+ attention_bias=attention_bias,
620
+ only_cross_attention=only_cross_attention,
621
+ double_self_attention=False,
622
+ upcast_attention=upcast_attention,
623
+ norm_type=norm_type,
624
+ norm_elementwise_affine=norm_elementwise_affine,
625
+ norm_eps=norm_eps,
626
+ attention_type=attention_type,
627
+ )
628
+ for d in range(num_layers)
629
+ ]
630
+ )
631
+
632
+
633
+ # 4. Define output layers
634
+ self.out_channels = in_channels if out_channels is None else out_channels
635
+ if self.is_input_continuous:
636
+ # TODO: should use out_channels for continuous projections
637
+ if use_linear_projection:
638
+ self.proj_out = linear_cls(inner_dim, in_channels)
639
+ else:
640
+ self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
641
+ elif self.is_input_vectorized:
642
+ self.norm_out = nn.LayerNorm(inner_dim)
643
+ self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
644
+ elif self.is_input_patches and norm_type != "ada_norm_single":
645
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
646
+ self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
647
+ self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
648
+ elif self.is_input_patches and norm_type == "ada_norm_single":
649
+ self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
650
+ self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
651
+ self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
652
+
653
+ # 5. PixArt-Alpha blocks.
654
+ self.adaln_single = None
655
+ self.use_additional_conditions = False
656
+ if norm_type == "ada_norm_single":
657
+ self.use_additional_conditions = self.config.sample_size == 128 # False, 128 -> 1024
658
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
659
+ # additional conditions until we find better name
660
+ self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
661
+
662
+ self.caption_projection = None
663
+ if caption_channels is not None:
664
+ self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)
665
+
666
+ self.gradient_checkpointing = False
667
+
668
+ # define temporal positional embedding
669
+ temp_pos_embed = self.get_1d_sincos_temp_embed(inner_dim, video_length) # 1152 hidden size
670
+ self.register_buffer("temp_pos_embed", torch.from_numpy(temp_pos_embed).float().unsqueeze(0), persistent=False)
671
+
672
+
673
+ def _set_gradient_checkpointing(self, module, value=False):
674
+ self.gradient_checkpointing = value
675
+
676
+
677
+ def forward(
678
+ self,
679
+ hidden_states: torch.Tensor,
680
+ timestep: Optional[torch.LongTensor] = None,
681
+ encoder_hidden_states: Optional[torch.Tensor] = None,
682
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
683
+ class_labels: Optional[torch.LongTensor] = None,
684
+ cross_attention_kwargs: Dict[str, Any] = None,
685
+ attention_mask: Optional[torch.Tensor] = None,
686
+ encoder_attention_mask: Optional[torch.Tensor] = None,
687
+ use_image_num: int = 0,
688
+ enable_temporal_attentions: bool = True,
689
+ return_dict: bool = True,
690
+ ):
691
+ """
692
+ The [`Transformer2DModel`] forward method.
693
+
694
+ Args:
695
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, frame, height, width)` if continuous):
696
+ Input `hidden_states`.
697
+ encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
698
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
699
+ self-attention.
700
+ timestep ( `torch.LongTensor`, *optional*):
701
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
702
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
703
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
704
+ `AdaLayerZeroNorm`.
705
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
706
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
707
+ `self.processor` in
708
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
709
+ attention_mask ( `torch.Tensor`, *optional*):
710
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
711
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
712
+ negative values to the attention scores corresponding to "discard" tokens.
713
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
714
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
715
+
716
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
717
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
718
+
719
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
720
+ above. This bias will be added to the cross-attention scores.
721
+ return_dict (`bool`, *optional*, defaults to `True`):
722
+ Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
723
+ tuple.
724
+
725
+ Returns:
726
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
727
+ `tuple` where the first element is the sample tensor.
728
+ """
729
+ input_batch_size, c, frame, h, w = hidden_states.shape
730
+ frame = frame - use_image_num
731
+ hidden_states = rearrange(hidden_states, 'b c f h w -> (b f) c h w').contiguous()
732
+
733
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
734
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
735
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
736
+ # expects mask of shape:
737
+ # [batch, key_tokens]
738
+ # adds singleton query_tokens dimension:
739
+ # [batch, 1, key_tokens]
740
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
741
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
742
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
743
+ if attention_mask is not None and attention_mask.ndim == 2:
744
+ # assume that mask is expressed as:
745
+ # (1 = keep, 0 = discard)
746
+ # convert mask into a bias that can be added to attention scores:
747
+ # (keep = +0, discard = -10000.0)
748
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
749
+ attention_mask = attention_mask.unsqueeze(1)
750
+
751
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
752
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: # ndim == 2 means no image joint
753
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
754
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
755
+ encoder_attention_mask = repeat(encoder_attention_mask, 'b 1 l -> (b f) 1 l', f=frame).contiguous()
756
+ elif encoder_attention_mask is not None and encoder_attention_mask.ndim == 3: # ndim == 3 means image joint
757
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
758
+ encoder_attention_mask_video = encoder_attention_mask[:, :1, ...]
759
+ encoder_attention_mask_video = repeat(encoder_attention_mask_video, 'b 1 l -> b (1 f) l', f=frame).contiguous()
760
+ encoder_attention_mask_image = encoder_attention_mask[:, 1:, ...]
761
+ encoder_attention_mask = torch.cat([encoder_attention_mask_video, encoder_attention_mask_image], dim=1)
762
+ encoder_attention_mask = rearrange(encoder_attention_mask, 'b n l -> (b n) l').contiguous().unsqueeze(1)
763
+
764
+
765
+ # Retrieve lora scale.
766
+ lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
767
+
768
+ # 1. Input
769
+ if self.is_input_patches: # here
770
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
771
+ num_patches = height * width
772
+
773
+ hidden_states = self.pos_embed(hidden_states) # already adds the spatial positional embeddings
774
+
775
+ if self.adaln_single is not None:
776
+ if self.use_additional_conditions and added_cond_kwargs is None:
777
+ raise ValueError(
778
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
779
+ )
780
+ # batch_size = hidden_states.shape[0]
781
+ batch_size = input_batch_size
782
+ timestep, embedded_timestep = self.adaln_single(
783
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
784
+ )
785
+
786
+ # 2. Blocks
787
+ if self.caption_projection is not None:
788
+ batch_size = hidden_states.shape[0]
789
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states) # (b, num_tokens, inner_dim), e.g. (3, 120, 1152)
790
+
791
+ if use_image_num != 0 and self.training:
792
+ encoder_hidden_states_video = encoder_hidden_states[:, :1, ...]
793
+ encoder_hidden_states_video = repeat(encoder_hidden_states_video, 'b 1 t d -> b (1 f) t d', f=frame).contiguous()
794
+ encoder_hidden_states_image = encoder_hidden_states[:, 1:, ...]
795
+ encoder_hidden_states = torch.cat([encoder_hidden_states_video, encoder_hidden_states_image], dim=1)
796
+ encoder_hidden_states_spatial = rearrange(encoder_hidden_states, 'b f t d -> (b f) t d').contiguous()
797
+ else:
798
+ encoder_hidden_states_spatial = repeat(encoder_hidden_states, 'b t d -> (b f) t d', f=frame).contiguous()
799
+
800
+ # prepare timesteps for spatial and temporal block
801
+ timestep_spatial = repeat(timestep, 'b d -> (b f) d', f=frame + use_image_num).contiguous()
802
+ timestep_temp = repeat(timestep, 'b d -> (b p) d', p=num_patches).contiguous()
803
+
804
+ for i, (spatial_block, temp_block) in enumerate(zip(self.transformer_blocks, self.temporal_transformer_blocks)):
805
+
806
+ if self.training and self.gradient_checkpointing:
807
+ hidden_states = torch.utils.checkpoint.checkpoint(
808
+ spatial_block,
809
+ hidden_states,
810
+ attention_mask,
811
+ encoder_hidden_states_spatial,
812
+ encoder_attention_mask,
813
+ timestep_spatial,
814
+ cross_attention_kwargs,
815
+ class_labels,
816
+ use_reentrant=False,
817
+ )
818
+
819
+ if enable_temporal_attentions:
820
+ hidden_states = rearrange(hidden_states, '(b f) t d -> (b t) f d', b=input_batch_size).contiguous()
821
+
822
+ if use_image_num != 0: # image-video joint training
823
+ hidden_states_video = hidden_states[:, :frame, ...]
824
+ hidden_states_image = hidden_states[:, frame:, ...]
825
+
826
+ if i == 0:
827
+ hidden_states_video = hidden_states_video + self.temp_pos_embed
828
+
829
+ hidden_states_video = torch.utils.checkpoint.checkpoint(
830
+ temp_block,
831
+ hidden_states_video,
832
+ None, # attention_mask
833
+ None, # encoder_hidden_states
834
+ None, # encoder_attention_mask
835
+ timestep_temp,
836
+ cross_attention_kwargs,
837
+ class_labels,
838
+ use_reentrant=False,
839
+ )
840
+
841
+ hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=1)
842
+ hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d', b=input_batch_size).contiguous()
843
+
844
+ else:
845
+ if i == 0:
846
+ hidden_states = hidden_states + self.temp_pos_embed
847
+
848
+ hidden_states = torch.utils.checkpoint.checkpoint(
849
+ temp_block,
850
+ hidden_states,
851
+ None, # attention_mask
852
+ None, # encoder_hidden_states
853
+ None, # encoder_attention_mask
854
+ timestep_temp,
855
+ cross_attention_kwargs,
856
+ class_labels,
857
+ use_reentrant=False,
858
+ )
859
+
860
+ hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d', b=input_batch_size).contiguous()
861
+ else:
862
+ hidden_states = spatial_block(
863
+ hidden_states,
864
+ attention_mask,
865
+ encoder_hidden_states_spatial,
866
+ encoder_attention_mask,
867
+ timestep_spatial,
868
+ cross_attention_kwargs,
869
+ class_labels,
870
+ )
871
+
872
+ if enable_temporal_attentions:
873
+
874
+ hidden_states = rearrange(hidden_states, '(b f) t d -> (b t) f d', b=input_batch_size).contiguous()
875
+
876
+ if use_image_num != 0 and self.training:
877
+ hidden_states_video = hidden_states[:, :frame, ...]
878
+ hidden_states_image = hidden_states[:, frame:, ...]
879
+
880
+ hidden_states_video = temp_block(
881
+ hidden_states_video,
882
+ None, # attention_mask
883
+ None, # encoder_hidden_states
884
+ None, # encoder_attention_mask
885
+ timestep_temp,
886
+ cross_attention_kwargs,
887
+ class_labels,
888
+ )
889
+
890
+ hidden_states = torch.cat([hidden_states_video, hidden_states_image], dim=1)
891
+ hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d', b=input_batch_size).contiguous()
892
+
893
+ else:
894
+ if i == 0 and frame > 1:
895
+ hidden_states = hidden_states + self.temp_pos_embed
896
+
897
+ hidden_states = temp_block(
898
+ hidden_states,
899
+ None, # attention_mask
900
+ None, # encoder_hidden_states
901
+ None, # encoder_attention_mask
902
+ timestep_temp,
903
+ cross_attention_kwargs,
904
+ class_labels,
905
+ )
906
+
907
+ hidden_states = rearrange(hidden_states, '(b t) f d -> (b f) t d', b=input_batch_size).contiguous()
908
+
909
+
910
+ if self.is_input_patches:
911
+ if self.config.norm_type != "ada_norm_single":
912
+ conditioning = self.transformer_blocks[0].norm1.emb(
913
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
914
+ )
915
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
916
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
917
+ hidden_states = self.proj_out_2(hidden_states)
918
+ elif self.config.norm_type == "ada_norm_single":
919
+ embedded_timestep = repeat(embedded_timestep, 'b d -> (b f) d', f=frame + use_image_num).contiguous()
920
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
921
+ hidden_states = self.norm_out(hidden_states)
922
+ # Modulation
923
+ hidden_states = hidden_states * (1 + scale) + shift
924
+ hidden_states = self.proj_out(hidden_states)
925
+
926
+ # unpatchify
927
+ if self.adaln_single is None:
928
+ height = width = int(hidden_states.shape[1] ** 0.5)
929
+ hidden_states = hidden_states.reshape(
930
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
931
+ )
932
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
933
+ output = hidden_states.reshape(
934
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
935
+ )
936
+ output = rearrange(output, '(b f) c h w -> b c f h w', b=input_batch_size).contiguous()
937
+
938
+ if not return_dict:
939
+ return (output,)
940
+
941
+ return Transformer3DModelOutput(sample=output)
942
+
943
+ def get_1d_sincos_temp_embed(self, embed_dim, length):
944
+ pos = torch.arange(0, length).unsqueeze(1)
945
+ return get_1d_sincos_pos_embed_from_grid(embed_dim, pos)
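+
+ if __name__ == '__main__':
+ # Minimal shape check with a toy configuration (illustrative only; these are not the released
+ # Latte-1 hyper-parameters, and it assumes the diffusers version pinned by this repo, which
+ # provides CaptionProjection and CombinedTimestepSizeEmbeddings).
+ model = LatteT2V(
+ num_attention_heads=4,
+ attention_head_dim=32, # inner_dim = 128
+ in_channels=4,
+ out_channels=8,
+ num_layers=2,
+ cross_attention_dim=128,
+ sample_size=32, # 32x32 latents
+ patch_size=2,
+ norm_type="ada_norm_single",
+ caption_channels=64,
+ video_length=8,
+ )
+ x = torch.randn(1, 4, 8, 32, 32) # (B, C, F, H, W)
+ text = torch.randn(1, 20, 64) # (B, num_tokens, caption_channels)
+ t = torch.tensor([10])
+ out = model(x, timestep=t, encoder_hidden_states=text).sample
+ print(out.shape) # expected: torch.Size([1, 8, 8, 32, 32])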
models/utils.py ADDED
@@ -0,0 +1,215 @@
+ # adopted from
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+ # and
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+ # and
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+ #
+ # thanks!
+
+
+ import os
+ import math
+ import torch
+
+ import numpy as np
+ import torch.nn as nn
+
+ from einops import repeat
+
+
+ #################################################################################
+ #                                  Unet Utils                                   #
+ #################################################################################
+
+ def checkpoint(func, inputs, params, flag):
+     """
+     Evaluate a function without caching intermediate activations, allowing for
+     reduced memory at the expense of extra compute in the backward pass.
+     :param func: the function to evaluate.
+     :param inputs: the argument sequence to pass to `func`.
+     :param params: a sequence of parameters `func` depends on but does not
+                    explicitly take as arguments.
+     :param flag: if False, disable gradient checkpointing.
+     """
+     if flag:
+         args = tuple(inputs) + tuple(params)
+         return CheckpointFunction.apply(func, len(inputs), *args)
+     else:
+         return func(*inputs)
+
+
+ class CheckpointFunction(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, run_function, length, *args):
+         ctx.run_function = run_function
+         ctx.input_tensors = list(args[:length])
+         ctx.input_params = list(args[length:])
+
+         with torch.no_grad():
+             output_tensors = ctx.run_function(*ctx.input_tensors)
+         return output_tensors
+
+     @staticmethod
+     def backward(ctx, *output_grads):
+         ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+         with torch.enable_grad():
+             # Fixes a bug where the first op in run_function modifies the
+             # Tensor storage in place, which is not allowed for detach()'d
+             # Tensors.
+             shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+             output_tensors = ctx.run_function(*shallow_copies)
+         input_grads = torch.autograd.grad(
+             output_tensors,
+             ctx.input_tensors + ctx.input_params,
+             output_grads,
+             allow_unused=True,
+         )
+         del ctx.input_tensors
+         del ctx.input_params
+         del output_tensors
+         return (None, None) + input_grads
+
+
+ def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+     """
+     Create sinusoidal timestep embeddings.
+     :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                       These may be fractional.
+     :param dim: the dimension of the output.
+     :param max_period: controls the minimum frequency of the embeddings.
+     :return: an [N x dim] Tensor of positional embeddings.
+     """
+     if not repeat_only:
+         half = dim // 2
+         freqs = torch.exp(
+             -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
+         ).to(device=timesteps.device)
+         args = timesteps[:, None].float() * freqs[None]
+         embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+         if dim % 2:
+             embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+     else:
+         embedding = repeat(timesteps, 'b -> b d', d=dim).contiguous()
+     return embedding
+
+
+ def zero_module(module):
+     """
+     Zero out the parameters of a module and return it.
+     """
+     for p in module.parameters():
+         p.detach().zero_()
+     return module
+
+
+ def scale_module(module, scale):
+     """
+     Scale the parameters of a module and return it.
+     """
+     for p in module.parameters():
+         p.detach().mul_(scale)
+     return module
+
+
+ def mean_flat(tensor):
+     """
+     Take the mean over all non-batch dimensions.
+     """
+     return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+ def normalization(channels):
+     """
+     Make a standard normalization layer.
+     :param channels: number of input channels.
+     :return: an nn.Module for normalization.
+     """
+     return GroupNorm32(32, channels)
+
+
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
+ class SiLU(nn.Module):
+     def forward(self, x):
+         return x * torch.sigmoid(x)
+
+
+ class GroupNorm32(nn.GroupNorm):
+     def forward(self, x):
+         return super().forward(x.float()).type(x.dtype)
+
+ def conv_nd(dims, *args, **kwargs):
+     """
+     Create a 1D, 2D, or 3D convolution module.
+     """
+     if dims == 1:
+         return nn.Conv1d(*args, **kwargs)
+     elif dims == 2:
+         return nn.Conv2d(*args, **kwargs)
+     elif dims == 3:
+         return nn.Conv3d(*args, **kwargs)
+     raise ValueError(f"unsupported dimensions: {dims}")
+
+
+ def linear(*args, **kwargs):
+     """
+     Create a linear module.
+     """
+     return nn.Linear(*args, **kwargs)
+
+
+ def avg_pool_nd(dims, *args, **kwargs):
+     """
+     Create a 1D, 2D, or 3D average pooling module.
+     """
+     if dims == 1:
+         return nn.AvgPool1d(*args, **kwargs)
+     elif dims == 2:
+         return nn.AvgPool2d(*args, **kwargs)
+     elif dims == 3:
+         return nn.AvgPool3d(*args, **kwargs)
+     raise ValueError(f"unsupported dimensions: {dims}")
+
+
+ # class HybridConditioner(nn.Module):
+
+ #     def __init__(self, c_concat_config, c_crossattn_config):
+ #         super().__init__()
+ #         self.concat_conditioner = instantiate_from_config(c_concat_config)
+ #         self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
+
+ #     def forward(self, c_concat, c_crossattn):
+ #         c_concat = self.concat_conditioner(c_concat)
+ #         c_crossattn = self.crossattn_conditioner(c_crossattn)
+ #         return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
+
+
+ def noise_like(shape, device, repeat=False):
+     repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
+     noise = lambda: torch.randn(shape, device=device)
+     return repeat_noise() if repeat else noise()
+
+ def count_flops_attn(model, _x, y):
+     """
+     A counter for the `thop` package to count the operations in an
+     attention operation.
+     Meant to be used like:
+         macs, params = thop.profile(
+             model,
+             inputs=(inputs, timestamps),
+             custom_ops={QKVAttention: QKVAttention.count_flops},
+         )
+     """
+     b, c, *spatial = y[0].shape
+     num_spatial = int(np.prod(spatial))
+     # We perform two matmuls with the same number of ops.
+     # The first computes the weight matrix, the second computes
+     # the combination of the value vectors.
+     matmul_ops = 2 * b * (num_spatial ** 2) * c
+     model.total_ops += torch.DoubleTensor([matmul_ops])
+
+ def count_params(model, verbose=False):
+     total_params = sum(p.numel() for p in model.parameters())
+     if verbose:
+         print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+     return total_params
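The helpers above are self-contained. A minimal usage sketch for timestep_embedding, checkpoint, and count_params, assuming the repository root is on PYTHONPATH so that models.utils is importable (the mlp/x/y names below are illustrative only):

    import torch
    from models.utils import timestep_embedding, checkpoint, count_params

    # Sinusoidal embeddings for a batch of four diffusion timesteps.
    t = torch.randint(0, 1000, (4,))
    emb = timestep_embedding(t, dim=256)                           # shape (4, 256)

    # Recompute activations during the backward pass instead of storing them.
    mlp = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.SiLU())
    x = emb.requires_grad_(True)
    y = checkpoint(mlp, (x,), tuple(mlp.parameters()), flag=True)
    y.sum().backward()

    count_params(mlp, verbose=True)                                # prints the parameter count in millions

With flag=False the call simply evaluates mlp(x) and activations are kept as usual.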
sample/__pycache__/pipeline_latte.cpython-312.pyc ADDED
Binary file (35.4 kB).
 
sample/ffs.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ export CUDA_VISIBLE_DEVICES=7
+
+ python sample/sample.py \
+ --config ./configs/ffs/ffs_sample.yaml \
+ --ckpt ./share_ckpts/ffs.pt \
+ --save_video_path ./test
sample/ffs_ddp.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ export CUDA_VISIBLE_DEVICES=6,7
+
+ torchrun --nnodes=1 --nproc_per_node=2 sample/sample_ddp.py \
+ --config ./configs/ffs/ffs_sample.yaml \
+ --ckpt ./share_ckpts/ffs.pt \
+ --save_video_path ./test