AeroXi commited on
Commit
ece766c
1 Parent(s): a862cc5

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. .gitignore +18 -0
  3. MIST_logo.png +0 -0
  4. README.md +238 -7
  5. assets/MIST_V2_LOGO.png +0 -0
  6. assets/effect_show.png +0 -0
  7. assets/output_image.png +0 -0
  8. assets/output_image_box.png +0 -0
  9. assets/robustness.png +3 -0
  10. assets/user_2.jpg +3 -0
  11. assets/user_case_1.png +3 -0
  12. assets/user_case_2.png +3 -0
  13. attacks/mist.py +1156 -0
  14. attacks/utils.py +113 -0
  15. data/MIST.png +0 -0
  16. eval/sample_lora_15.ipynb +0 -0
  17. eval/train_dreambooth_lora_15.py +1007 -0
  18. ldm/configs/karlo/decoder_900M_vit_l.yaml +37 -0
  19. ldm/configs/karlo/improved_sr_64_256_1.4B.yaml +27 -0
  20. ldm/configs/karlo/prior_1B_vit_l.yaml +21 -0
  21. ldm/configs/stable-diffusion/intel/v2-inference-bf16.yaml +71 -0
  22. ldm/configs/stable-diffusion/intel/v2-inference-fp32.yaml +70 -0
  23. ldm/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml +72 -0
  24. ldm/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml +71 -0
  25. ldm/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml +80 -0
  26. ldm/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml +83 -0
  27. ldm/configs/stable-diffusion/v2-inference-v.yaml +68 -0
  28. ldm/configs/stable-diffusion/v2-inference.yaml +67 -0
  29. ldm/configs/stable-diffusion/v2-inpainting-inference.yaml +158 -0
  30. ldm/configs/stable-diffusion/v2-midas-inference.yaml +74 -0
  31. ldm/configs/stable-diffusion/x4-upscaling.yaml +76 -0
  32. ldm/data/__init__.py +0 -0
  33. ldm/data/util.py +24 -0
  34. ldm/models/autoencoder.py +219 -0
  35. ldm/models/diffusion/__init__.py +0 -0
  36. ldm/models/diffusion/ddim.py +337 -0
  37. ldm/models/diffusion/ddpm.py +1884 -0
  38. ldm/models/diffusion/dpm_solver/__init__.py +1 -0
  39. ldm/models/diffusion/dpm_solver/dpm_solver.py +1163 -0
  40. ldm/models/diffusion/dpm_solver/sampler.py +96 -0
  41. ldm/models/diffusion/plms.py +245 -0
  42. ldm/models/diffusion/sampling_util.py +22 -0
  43. ldm/modules/attention.py +341 -0
  44. ldm/modules/diffusionmodules/__init__.py +0 -0
  45. ldm/modules/diffusionmodules/model.py +852 -0
  46. ldm/modules/diffusionmodules/openaimodel.py +807 -0
  47. ldm/modules/diffusionmodules/upscaling.py +81 -0
  48. ldm/modules/diffusionmodules/util.py +278 -0
  49. ldm/modules/distributions/__init__.py +0 -0
  50. ldm/modules/distributions/distributions.py +92 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/robustness.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/user_2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/user_case_1.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/user_case_2.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin
2
+ *.ckpt
3
+ *.pt
4
+ logs/
5
+ *.safetensors
6
+ *.jpg
7
+ *.png
8
+ !data/MIST.png
9
+ !/MIST_logo.png
10
+ *.zip
11
+ __pycache__/
12
+ stable-diffusion/*/
13
+ test/
14
+ *.pkl
15
+ data/training/*
16
+ output/lora/*
17
+ output/mist/*
18
+ !assets/*
MIST_logo.png ADDED
README.md CHANGED
@@ -1,12 +1,243 @@
1
  ---
2
- title: Mist V2
3
- emoji: 👁
4
- colorFrom: blue
5
- colorTo: gray
6
  sdk: gradio
7
  sdk_version: 4.11.0
8
- app_file: app.py
9
- pinned: false
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: mist-v2
3
+ app_file: mist-webui.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.11.0
 
 
6
  ---
7
+ <p align="center">
8
+ <br>
9
+ <!-- <img src="mist_logo.png"> -->
10
+ <img src="assets/MIST_V2_LOGO.png">
11
+ <br>
12
+ </p>
13
+
14
+
15
+ [![project page](https://img.shields.io/badge/homepage-mist--project.io-blue.svg)](https://mist-project.github.io/index_en.html)
16
+ [![arXiv](https://img.shields.io/badge/arXiv-2310.04687-red.svg)](https://arxiv.org/abs/2310.04687)
17
+ <!--
18
+ [![document](https://img.shields.io/badge/document-passing-light_green.svg)](https://arxiv.org/abs/2310.04687)
19
+ -->
20
+ <!--
21
+ ### [project page](https://mist-project.github.io) | [arxiv](https://arxiv.org/abs/2310.04687) | [document](https://arxiv.org/abs/2310.04687) -->
22
+
23
+ <!-- #region -->
24
+ <!-- <p align="center">
25
+ <img src="effect_show.png">
26
+ </p> -->
27
+ <!-- #endregion -->
28
+ <!--
29
+ > Mist adds watermarks to images, making them unrecognizable and unusable for AI-for-Art models that try to mimic them. -->
30
+
31
+ <!-- #region -->
32
+ <p align="center">
33
+ <img src="assets/user_2.jpg">
34
+ </p>
35
+ <!-- <p align="center">
36
+ <img src="user_case_2.png">
37
+ </p> -->
38
+ <!-- #endregion -->
39
+
40
+ > Mist's Effects in User Cases. **The first row:** Lora generation from source images.
41
+ **The second row:** Lora generation from Mist-treated samples. Mist V2 significantly disrupts the output of the generation, effectively protecting artists' images. Used images are from anonymous artists. All rights reserved.
42
+ <!-- #region -->
43
+ <!-- <p align="center">
44
+ <img src="robustness.png">
45
+ </p> -->
46
+ <!-- #endregion -->
47
+
48
+ <!-- > Robustness of Mist against image preprocessing. -->
49
+
50
+ <!-- ## News
51
+
52
+ **2022/12/11**: Mist V2 released. -->
53
+
54
+ ## Main Features
55
+ - Enhanced protection against AI-for-Art applications like Lora and SDEdit
56
+ - Imperceptible noise.
57
+ - 3-5 minutes processing with only 6GB of GPU memory in most cases. CPU processing supported.
58
+ - Resilience against denoising methods.
59
+
60
+
61
+ ## About Mist
62
+ Mist is a powerful image preprocessing tool designed for the purpose of protecting the style and content of
63
+ images from being mimicked by state-of-the-art AI-for-Art applications. By adding watermarks to the images, Mist renders them unrecognizable and inimitable for the
64
+ models employed by AI-for-Art applications. Attempts by AI-for-Art applications to mimic these Misted images
65
+ will be ineffective, and the output image of such mimicry will be scrambled and unusable as artwork.
66
+
67
+
68
+ <p align="center">
69
+ <img src="assets/effect_show.png">
70
+ </p>
71
+
72
+ In Mist V2, we have enhanced its effectiveness against a wider range of AI-for-Art applications, particularly excelling with Lora. Mist V2 achieves robust defense with even more discreet watermarks compared to [Mist V1](https://github.com/mist-project/mist). Additionally, Mist V2 introduces support for CPU processing and can efficiently run on GPUs with as little as 6GB of memory in most cases.
73
+
74
+
75
+ <!-- For more details, refer to our [documentation](https://arxiv.org/abs/2310.04687). -->
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+ ## Quick Start
84
+
85
+ ### Environment
86
+
87
+ **Preliminaries:** To run this repository, please have [Anaconda](https://www.anaconda.com/) installed on your workstation. The GPU version of Mist requires an NVIDIA GPU of the [Ampere](https://en.wikipedia.org/wiki/Ampere_(microarchitecture)) or a more advanced architecture with more than 6GB VRAM. You can also try the CPU version
88
+ at a moderate running speed.
89
+
90
+ Clone this repository to your local machine and enter the repository root:
91
+
92
+ ```bash
93
+ git clone https://github.com/mist-project/mist-v2.git
94
+ cd mist-v2
95
+ ```
96
+
97
+ Then, run the following commands in the root of the repository to install the environment:
98
+
99
+ ```bash
100
+ conda create -n mist-v2 python=3.10
101
+ conda activate mist-v2
102
+ pip install -r requirements.txt
103
+ ```
104
+
105
+ ### Usage
106
+
107
+ Run Mist V2 in the default setup on GPU:
108
+ ```bash
109
+ accelerate launch attacks/mist.py --cuda --low_vram_mode --instance_data_dir $INSTANCE_DIR --output_dir $OUTPUT_DIR --class_data_dir $CLASS_DATA_DIR --instance_prompt $PROMPT --class_prompt $CLASS_PROMPT --mixed_precision bf16
110
+ ```
111
+
112
+ Run Mist V2 in the default setup on CPU:
113
+ ```bash
114
+ accelerate launch attacks/mist.py --instance_data_dir $INSTANCE_DIR --output_dir $OUTPUT_DIR --class_data_dir $CLASS_DATA_DIR --instance_prompt $PROMPT --class_prompt $CLASS_PROMPT --mixed_precision bf16
115
+ ```
116
+
117
+ The parameters are demonstrated in the following table:
118
+
119
+ | Parameter | Explanation |
120
+ | --------------- | ------------------------------------------------------------------------------------------ |
121
+ | $INSTANCE_DIR | Directory of input clean images. The goal is to add adversarial noise to them. |
122
+ | $OUTPUT_DIR | Directory for output adversarial examples (misted images). |
123
+ | $CLASS_DATA_DIR | Directory for class data in prior preserved training of Dreambooth, required to be empty. |
124
+ | $PROMPT | Prompt that describes the input clean images, used to perturb the images. |
125
+ | $CLASS_PROMPT | Prompt used to generate class data, recommended to be similar to $PROMPT. |
126
+
127
+ Here is a case command to run Mist V2 on GPU:
128
+
129
+ ```bash
130
+ accelerate launch attacks/mist.py --cuda --low_vram_mode --instance_data_dir data/training --output_dir output/ --class_data_dir data/class --instance_prompt "a photo of a misted person, high quality, masterpiece" --class_prompt "a photo of a person, high quality, masterpiece" --mixed_precision bf16
131
+ ```
132
+
133
+ We also provide a WebUI with the help of [Gradio](https://www.gradio.app/). To launch the WebUI, run:
134
+
135
+ ```bash
136
+ python mist-webui.py
137
+ ```
138
+
139
+ ### Evaluation
140
+
141
+ We provide a simple pipeline to evaluate the output adversarial examples (only for GPU users).
142
+ Basically, this pipeline trains a LoRA on the adversarial examples and samples images with the LoRA.
143
+ Note that our adversarial examples may induce LoRA to output images with NSFW contents
144
+ (for example, chaotic texture). As stated, this is to prevent LoRA training on unauthorized image data. To evaluate the effectiveness of our method, we disable the safety checker in the LoRA sampling script. The following instructions describe how to run the pipeline.
145
+
146
+ First, train a LoRA on the output adversarial examples.
147
+
148
+ ```bash
149
+ accelerate launch eval/train_dreambooth_lora_15.py --instance_data_dir=$LORA_INPUT_DIR --output_dir=$LORA_OUTPUT_DIR --class_data_dir=$LORA_CLASS_DIR --instance_prompt $LORA_PROMPT --class_prompt $LORA_CLASS_PROMPT --resolution=512 --train_batch_size=1 --learning_rate=1e-4 --scale_lr --max_train_steps=2000
150
+ ```
151
+
152
+ The parameters are demonstrated in the following table:
153
+
154
+
155
+ | Parameter | Explanation |
156
+ | ------------------ | ---------------------------------------------------------------------------------------------------------- |
157
+ | $LORA_INPUT_DIR | Directory of training data (adversarial examples), staying the same as $OUTPUT_DIR in the previous table. |
158
+ | $LORA_OUTPUT_DIR | Directory to store the trained LoRA. |
159
+ | $LORA_CLASS_DIR | Directory for class data in prior preserved training of Dreambooth, required to be empty. |
160
+ | $LORA_PROMPT | Prompt that describes the training data, used to train the LoRA. |
161
+ | $LORA_CLASS_PROMPT | Prompt used to generate class data, recommended to be related to $LORA_PROMPT. |
162
+
163
+
164
+ Next, open the `eval/sample_lora_15.ipynb` and run the first block. After that, change the value of the variable `LORA_OUTPUT_DIR` to be the previous `$LORA_OUTPUT_DIR` when training the LoRA.
165
+
166
+ ```Python
167
+ from lora_diffusion import tune_lora_scale, patch_pipe
168
+ torch.manual_seed(time.time())
169
+
170
+ # The directory of LoRA
171
+ LORA_OUTPUT_DIR = [The value of $LORA_OUTPUT_DIR]
172
+ ...
173
+ ```
174
+
175
+ Finally, run the second block to see the output and evaluate the performance of Mist.
176
+
177
+
178
+ ## A Glimpse to Methodology
179
+
180
+ Mist V2 works by adversarially attacking generative diffusion models. Basically, the attack is an optimization over the following objective:
181
+
182
+ $$ \min_{x'} \mathbb{E}_{(z_0', \epsilon, t)} \Vert \epsilon_\theta(z'_t(z'_0,\epsilon),t)-z_0^T\Vert^2_2, \quad \text{s.t. } \Vert x'-x\Vert\leq\zeta$$
183
+
184
+ We demonstrate the notation in the following table.
185
+
186
+ | Variable | Explanation |
187
+ | ----------------- | ---------------------------------------------------------------- |
188
+ | $x$ / $x'$ | The clean image / The adversarial example |
189
+ | $t$ | Time step in the diffusion model. |
190
+ | $z'_0$ | The latent variable of $x'$ in the 0th time step |
191
+ | $\epsilon$ | A standard Gaussian noise |
192
+ | $z_0^T$ | The latent variable of a target image $x^T$ in the 0th time step |
193
+ | $\epsilon_\theta$ | The noise predictor (U-Net) in the diffusion model |
194
+ | $\zeta$ | The budget of adversarial noise |
195
+
196
+
197
+ Intuitively, we find that pushing the output of the U-Net in the diffusion model to the 0th timestep
198
+ latent variable of a target image can effectively confuse the diffusion model. This abstracts the
199
+ aforementioned objective of Mist V2.
200
+
201
+ Our paper is still in progress. We are trying to reveal the mechanism behind our method in the paper. Despite this, you can access [arXiv](https://arxiv.org/abs/2310.04687) to view the first draft of our paper.
202
+
203
+ ## License
204
+
205
+ This project is licensed under the [GPL-3.0 license](https://github.com/mist-project/mist/blob/main/LICENSE).
206
+
207
+
208
+ ## Citation
209
+ If you find our work valuable and utilize it, we kindly request that you cite our paper.
210
+
211
+ ```
212
+ @article{zheng2023understanding,
213
+ title={Understanding and Improving Adversarial Attacks on Latent Diffusion Model},
214
+ author={Zheng, Boyang and Liang, Chumeng and Wu, Xiaoyu and Liu, Yan},
215
+ journal={arXiv preprint arXiv:2310.04687},
216
+ year={2023}
217
+ }
218
+ ```
219
+
220
+ Our repository also refers to the following papers:
221
+
222
+ ```
223
+ @inproceedings{van2023anti,
224
+ title={Anti-DreamBooth: Protecting users from personalized text-to-image synthesis},
225
+ author={Van Le, Thanh and Phung, Hao and Nguyen, Thuan Hoang and Dao, Quan and Tran, Ngoc N and Tran, Anh},
226
+ booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
227
+ pages={2116--2127},
228
+ year={2023}
229
+ }
230
+ ```
231
+
232
+ ```
233
+ @article{liang2023mist,
234
+ title={Mist: Towards Improved Adversarial Examples for Diffusion Models},
235
+ author={Liang, Chumeng and Wu, Xiaoyu},
236
+ journal={arXiv preprint arXiv:2305.12683},
237
+ year={2023}
238
+ }
239
+ ```
240
+
241
+
242
+
243
 
 
assets/MIST_V2_LOGO.png ADDED
assets/effect_show.png ADDED
assets/output_image.png ADDED
assets/output_image_box.png ADDED
assets/robustness.png ADDED

Git LFS Details

  • SHA256: 5605609933bfac9a71d68c5072afc2689d3ac80d89a54e3a6f2d09c203057ef1
  • Pointer size: 132 Bytes
  • Size of remote file: 2.11 MB
assets/user_2.jpg ADDED

Git LFS Details

  • SHA256: dc39fd3f114ff2e3bfe138bf64b43a3c967be0cb87f69b08202906e6edbb7b4f
  • Pointer size: 133 Bytes
  • Size of remote file: 11.1 MB
assets/user_case_1.png ADDED

Git LFS Details

  • SHA256: dd427e23cd5c982ed80bb9c66044dc79ca75300d8ed07aceb215287334b42389
  • Pointer size: 132 Bytes
  • Size of remote file: 1.84 MB
assets/user_case_2.png ADDED

Git LFS Details

  • SHA256: 1370e22afabe2be7f06dd82defc15a34cce535e08d9a151e2d9f63fbe2744de2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.7 MB
attacks/mist.py ADDED
@@ -0,0 +1,1156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import copy
3
+ import hashlib
4
+ import itertools
5
+ import logging
6
+ import os
7
+ import sys
8
+ import gc
9
+ from pathlib import Path
10
+ from colorama import Fore, Style, init,Back
11
+ import random, time
12
+ '''some system level settings'''
13
+ init(autoreset=True)
14
+ sys.path.insert(0, sys.path[0]+"/../")
15
+ import lpips
16
+
17
+ import datasets
18
+ import diffusers
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn.functional as F
22
+ import torch.utils.checkpoint
23
+ import transformers
24
+ from accelerate import Accelerator
25
+ from accelerate.logging import get_logger
26
+ from accelerate.utils import set_seed
27
+ from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel,DDIMScheduler
28
+ from diffusers.utils.import_utils import is_xformers_available
29
+ from PIL import Image
30
+ from torch.utils.data import Dataset
31
+ from torchvision import transforms
32
+ from tqdm.auto import tqdm
33
+ from transformers import AutoTokenizer, PretrainedConfig
34
+ from torch import autograd
35
+ from typing import Optional, Tuple
36
+ import pynvml
37
+ # from utils import print_tensor
38
+
39
+ from lora_diffusion import (
40
+ extract_lora_ups_down,
41
+ inject_trainable_lora,
42
+ )
43
+ from lora_diffusion.xformers_utils import set_use_memory_efficient_attention_xformers
44
+ from attacks.utils import LatentAttack
45
+
46
+ logger = get_logger(__name__)
47
+
48
def parse_args(input_args=None):
    """Build the command-line parser for the Mist attack script and parse it.

    Args:
        input_args: Optional list of argument strings. When ``None``,
            ``argparse`` falls back to ``sys.argv[1:]``.

    Returns:
        The parsed ``argparse.Namespace``.

    Side effects:
        If ``--output_dir`` is a non-empty path that does not exist yet, the
        directory is created and a coloured notice is printed.
    """
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--cuda",
        action="store_true",
        help="Use gpu for attack",
    )
    parser.add_argument(
        "--pretrained_model_name_or_path",
        "-p",
        type=str,
        default="./stable-diffusion/stable-diffusion-1-5",
        required=False,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help=(
            "Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be"
            " float32 precision."
        ),
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default="",
        required=False,
        help="A folder containing the images to add adversarial noise",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default="",
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default="a picture",
        required=False,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default="a picture",
        help="The prompt to specify images in the same class as provided instance images.",
    )
    # NOTE(review): no `action`/`type` here, so the option consumes a value and
    # stores it as a string; any supplied value (even "False") is truthy.
    parser.add_argument(
        "--with_prior_preservation",
        default=True,
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument(
        "--prior_loss_weight",
        type=float,
        default=0.1,
        help="The weight of prior preservation loss.",
    )
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=50,
        help=(
            "Minimal class images for prior preservation loss. If there are not enough images already present in"
            " class_data_dir, additional images will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="",
        help="The output directory where the perturbed data is stored",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    # NOTE(review): same caveat as --with_prior_preservation: the value is an
    # untyped string, so it cannot be switched off from the command line.
    parser.add_argument(
        "--center_crop",
        default=True,
        help=(
            "Whether to center crop the input images to the resolution. If not set, the images will be randomly"
            " cropped. The images will be resized to the resolution first before cropping."
        ),
    )
    # NOTE(review): `store_false` makes the default True and passing the flag
    # sets it to False, which contradicts the "If set" wording. Confirm intent.
    parser.add_argument(
        "--train_text_encoder",
        action="store_false",
        help="Whether to train the text encoder. If set, the text encoder should be float32 precision.",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--sample_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for sampling images.",
    )
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=5,
        help="Total number of training steps to perform.",
    )
    parser.add_argument(
        "--max_f_train_steps",
        type=int,
        default=10,
        help="Total number of sub-steps to train surrogate model.",
    )
    parser.add_argument(
        "--max_adv_train_steps",
        type=int,
        default=30,
        help="Total number of sub-steps to train adversarial noise.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--checkpointing_iterations",
        type=int,
        default=5,
        help=("Save a checkpoint of the training state every X iterations."),
    )

    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--allow_tf32",
        action="store_true",
        help=(
            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
        ),
    )
    parser.add_argument(
        "--report_to",
        type=str,
        default="tensorboard",
        help=(
            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default="bf16",
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    # NOTE(review): `store_false` means low-VRAM mode is ON by default and
    # passing `--low_vram_mode` turns it OFF, although the README passes the
    # flag in its GPU example. Left unchanged to keep the existing defaults.
    parser.add_argument(
        "--low_vram_mode",
        action="store_false",
        help="Whether or not to use low vram mode.",
    )
    parser.add_argument(
        "--pgd_alpha",
        type=float,
        default=5e-3,
        help="The step size for pgd.",
    )
    parser.add_argument(
        "--pgd_eps",
        type=float,
        default=float(8.0/255.0),
        help="The noise budget for pgd.",
    )
    # The two LPIPS options previously reused the --pgd_eps help text verbatim.
    parser.add_argument(
        "--lpips_bound",
        type=float,
        default=0.1,
        help="The bound on the LPIPS distance (see --constraint lpips).",
    )
    parser.add_argument(
        "--lpips_weight",
        type=float,
        default=0.5,
        help="The weight of the LPIPS term (see --constraint lpips).",
    )
    parser.add_argument(
        "--fused_weight",
        type=float,
        default=1e-5,
        help="The decay of alpha and eps when applying pre_attack",
    )
    parser.add_argument(
        "--target_image_path",
        default="data/MIST.png",
        help="target image for attacking",
    )

    parser.add_argument(
        "--lora_rank",
        type=int,
        default=4,
        help="Rank of LoRA approximation.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--learning_rate_text",
        type=float,
        default=5e-6,
        help="Initial learning rate for text encoder (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--mode",
        type=str,
        choices=['lunet','fused', 'anti-db'],
        default='lunet',
        help="The mode of attack",
    )
    parser.add_argument(
        "--constraint",
        type=str,
        choices=['eps','lpips'],
        default='eps',
        help="The constraint of attack",
    )
    parser.add_argument(
        "--use_8bit_adam",
        action="store_true",
        help="Whether or not to use 8-bit Adam from bitsandbytes.",
    )
    parser.add_argument(
        "--adam_beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_beta2",
        type=float,
        default=0.999,
        help="The beta2 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
    )
    parser.add_argument(
        "--adam_epsilon",
        type=float,
        default=1e-08,
        help="Epsilon value for the Adam optimizer",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )

    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--resume_unet",
        type=str,
        default=None,
        help=("File path for unet lora to resume training."),
    )
    parser.add_argument(
        "--resume_text_encoder",
        type=str,
        default=None,
        help=("File path for text encoder lora to resume training."),
    )
    parser.add_argument(
        "--resize",
        action='store_true',
        required=False,
        help="Should images be resized to --resolution after attacking?",
    )

    # `parse_args(None)` reads sys.argv[1:], so one call covers both cases.
    args = parser.parse_args(input_args)

    # Create the output directory up front and announce it only when it is new.
    if args.output_dir != "":
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir,exist_ok=True)
            print(Back.BLUE+Fore.GREEN+'create output dir: {}'.format(args.output_dir))
    return args
388
+
389
+
390
class DreamBoothDatasetFromTensor(Dataset):
    """Just like DreamBoothDataset, but take instance_images_tensor instead of path.

    Each item is a dict holding one instance image (returned exactly as stored
    in ``instance_images_tensor``), the token ids of its prompt and, when
    ``class_data_root`` is given, one transformed class image together with the
    token ids of ``class_prompt``.
    """

    def __init__(
        self,
        instance_images_tensor,
        prompts,
        instance_prompt,
        tokenizer,
        class_data_root=None,
        class_prompt=None,
        size=512,
        center_crop=False,
    ):
        """Store the instance data and, optionally, index the class images.

        Args:
            instance_images_tensor: Indexable collection of instance images;
                its ``len()`` defines the dataset length.
            prompts: Per-image prompts, indexed in parallel with the images.
                A ``None`` entry falls back to ``instance_prompt``.
            instance_prompt: Fallback prompt for images without their own.
            tokenizer: Callable tokenizer exposing ``model_max_length``.
            class_data_root: Optional directory of class images. It is created
                if missing and every entry in it is treated as an image path.
            class_prompt: Prompt tokenized for every class image.
            size: Target size for resizing and cropping class images.
            center_crop: Use a center crop for class images instead of a
                random crop.
        """
        self.size = size
        self.center_crop = center_crop
        self.tokenizer = tokenizer

        self.instance_images_tensor = instance_images_tensor
        self.instance_prompts = prompts
        self.num_instance_images = len(self.instance_images_tensor)
        self.instance_prompt = instance_prompt
        self._length = self.num_instance_images

        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
            self.class_images_path = list(self.class_data_root.iterdir())
            self.num_class_images = len(self.class_images_path)
            # self._length = max(self.num_class_images, self.num_instance_images)
            self.class_prompt = class_prompt
        else:
            self.class_data_root = None

        # Only applied to class images; instance images are used as provided.
        self.image_transforms = transforms.Compose(
            [
                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

    def __len__(self):
        """Return the number of instance images."""
        return self._length

    def __getitem__(self, index):
        """Return the example dict for ``index``.

        Keys: ``instance_images`` and ``instance_prompt_ids`` always;
        ``class_images`` and ``class_prompt_ids`` only when a class data
        directory was configured.
        """
        example = {}
        instance_image = self.instance_images_tensor[index % self.num_instance_images]
        instance_prompt = self.instance_prompts[index % self.num_instance_images]
        if instance_prompt is None:
            instance_prompt = self.instance_prompt
        # NOTE(review): the quality-tag prefix is concatenated without a
        # separator, so it runs straight into the prompt ("...glow<prompt>").
        # Kept byte-for-byte because it changes the tokenized input.
        instance_prompt = \
            'masterpiece,best quality,extremely detailed CG unity 8k wallpaper,illustration,cinematic lighting,beautiful detailed glow' + instance_prompt
        example["instance_images"] = instance_image
        example["instance_prompt_ids"] = self.tokenizer(
            instance_prompt,
            truncation=True,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        if self.class_data_root:
            # Class images are cycled with a modulo over the directory listing.
            # NOTE(review): an empty class directory gives num_class_images == 0
            # and this modulo raises ZeroDivisionError; presumably the caller
            # populates the directory first. Verify against the caller.
            class_image = Image.open(self.class_images_path[index % self.num_class_images])
            if not class_image.mode == "RGB":
                class_image = class_image.convert("RGB")
            example["class_images"] = self.image_transforms(class_image)
            example["class_prompt_ids"] = self.tokenizer(
                self.class_prompt,
                truncation=True,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                return_tensors="pt",
            ).input_ids

        return example
467
+
468
+
469
def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str):
    """Return the text-encoder class named by the checkpoint's config.

    The ``text_encoder`` sub-config of the checkpoint is loaded and the first
    entry of its ``architectures`` list selects the class. Only
    ``CLIPTextModel`` and ``RobertaSeriesModelWithTransformation`` are
    supported; anything else raises ``ValueError``.
    """
    config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )
    architecture = config.architectures[0]

    # Imports stay local so only the architecture actually needed is loaded.
    if architecture == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel

    if architecture == "RobertaSeriesModelWithTransformation":
        from diffusers.pipelines.alt_diffusion.modeling_roberta_series import RobertaSeriesModelWithTransformation

        return RobertaSeriesModelWithTransformation

    raise ValueError(f"{architecture} is not supported.")
487
+
488
+
489
class PromptDataset(Dataset):
    """Serve one fixed prompt ``num_samples`` times, tagging each item with its index.

    Used to fan class-image generation out over a data loader.
    """

    def __init__(self, prompt, num_samples):
        self.num_samples = num_samples
        self.prompt = prompt

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        # Every item carries the same prompt; only the index differs.
        return {"prompt": self.prompt, "index": index}
504
+
505
+
506
def load_data(data_dir, size=512, center_crop=True) -> tuple:
    """Load every ``.png``/``.jpg`` image in ``data_dir`` with its optional prompt.

    For an image ``name.png`` the prompt is read from ``name.txt`` in the same
    directory when that file exists; otherwise ``None`` is recorded so the
    caller can fall back to its default prompt.

    Args:
        data_dir: directory to scan (non-recursive, ``os.listdir`` order).
        size: every image is resized to ``(size, size)``.
        center_crop: accepted for interface compatibility; currently unused
            because images are resized to a square instead of cropped.

    Returns:
        ``(images, prompts, sizes)`` where ``images`` is the stacked tensor of
        normalized images, ``prompts`` is a list of ``str`` or ``None`` and
        ``sizes`` holds each image's original PIL ``size``.

    Raises:
        ValueError: if ``data_dir`` contains no ``.png``/``.jpg`` file.
    """
    image_transforms = transforms.Compose(
        [
            transforms.Resize((size, size), interpolation=transforms.InterpolationMode.BILINEAR),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    # load images & prompts
    images, prompts = [], []
    for filename in os.listdir(data_dir):
        if not filename.endswith((".png", ".jpg")):
            continue

        file_path = os.path.join(data_dir, filename)
        # convert() returns an independent copy, so the file can be closed now.
        with Image.open(file_path) as opened:
            images.append(opened.convert("RGB"))
        image_index = len(images) - 1

        prompt_path = os.path.join(data_dir, os.path.splitext(filename)[0] + ".txt")
        if os.path.exists(prompt_path):
            with open(prompt_path, "r") as file:
                text_string = file.read()
            prompts.append(text_string)
            print("==load image {} from {}, prompt: {}==".format(image_index, file_path, text_string))
        else:
            prompts.append(None)
            print("==load image {} from {}, prompt: None, args.instance_prompt used==".format(image_index, file_path))

    if not images:
        # torch.stack([]) would fail with an unrelated-looking error.
        raise ValueError("No .png or .jpg images found in {}".format(data_dir))

    # original sizes, recorded before resizing
    sizes = [img.size for img in images]

    # preprocess images
    images = torch.stack([image_transforms(img) for img in images])
    print("==tensor shape: {}==".format(images.shape))

    return images, prompts, sizes
546
+
547
+
548
def train_one_epoch(
    args,
    accelerator,
    models,
    tokenizer,
    noise_scheduler,
    vae,
    data_tensor: torch.Tensor,
    prompts,
    weight_dtype=torch.bfloat16,
):
    """Fine-tune LoRA copies of the models on the (perturbed) images.

    ``models`` is ``[unet, text_encoder]``; both are deep-copied, LoRA layers
    are injected into the copies and the copies are trained for
    ``args.max_f_train_steps`` steps. Each step draws one random image and
    back-propagates the instance loss and, when the dataset provides class
    images, the prior loss scaled by ``args.prior_loss_weight``.

    Returns:
        ``[unet, text_encoder]`` - the trained copies; the inputs are untouched.
    """
    # prepare training data
    train_dataset = DreamBoothDatasetFromTensor(
        data_tensor,
        prompts,
        args.instance_prompt,
        tokenizer,
        args.class_data_dir,
        args.class_prompt,
        args.resolution,
        args.center_crop,
    )

    device = accelerator.device

    # prepare models & inject lora layers (into fresh copies on every call)
    unet, text_encoder = copy.deepcopy(models[0]), copy.deepcopy(models[1])
    vae.to(device, dtype=weight_dtype)
    vae.requires_grad_(False)
    text_encoder.to(device, dtype=weight_dtype)
    unet.to(device, dtype=weight_dtype)
    if args.low_vram_mode:
        set_use_memory_efficient_attention_xformers(unet, True)

    unet_lora_params, _ = inject_trainable_lora(
        unet, r=args.lora_rank, loras=args.resume_unet
    )
    if args.train_text_encoder:
        text_encoder_lora_params, _ = inject_trainable_lora(
            text_encoder,
            target_replace_module=["CLIPAttention"],
            r=args.lora_rank,
        )

    # build the optimizer
    optimizer_class = torch.optim.AdamW

    text_lr = (
        args.learning_rate
        if args.learning_rate_text is None
        else args.learning_rate_text
    )

    params_to_optimize = (
        [
            {
                "params": itertools.chain(*unet_lora_params),
                "lr": args.learning_rate,
            },
            {
                "params": itertools.chain(*text_encoder_lora_params),
                "lr": text_lr,
            },
        ]
        if args.train_text_encoder
        else itertools.chain(*unet_lora_params)
    )

    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    # begin training
    for step in range(args.max_f_train_steps):
        unet.train()
        text_encoder.train()

        # NOTE(review): re-seeding from the clock overrides any seed set via
        # args.seed for the image choice; kept as in the original.
        random.seed(time.time())
        instance_idx = random.randint(0, len(train_dataset) - 1)
        step_data = train_dataset[instance_idx]

        # (image, prompt ids, is_prior) - instance first, then the class
        # sample when prior preservation data is available.
        samples = [(step_data["instance_images"], step_data["instance_prompt_ids"], False)]
        if "class_images" in step_data:
            samples.append((step_data["class_images"], step_data["class_prompt_ids"], True))

        for image, prompt_ids, is_prior in samples:
            # calculate loss of instance and class separately
            pixel_value = image.unsqueeze(0).to(device, dtype=weight_dtype)
            latents = vae.encode(pixel_value).latent_dist.sample().detach().clone()
            latents = latents * vae.config.scaling_factor
            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            bsz = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
            timesteps = timesteps.long()
            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)
            # encode text
            encode_hidden_states = text_encoder(prompt_ids.to(device))[0]
            # Get the target for loss depending on the prediction type
            if noise_scheduler.config.prediction_type == "epsilon":
                target = noise
            elif noise_scheduler.config.prediction_type == "v_prediction":
                target = noise_scheduler.get_velocity(latents, noise, timesteps)
            else:
                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
            model_pred = unet(noisy_latents, timesteps, encode_hidden_states).sample
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
            # Log the unweighted value; reading it before scaling avoids
            # dividing by prior_loss_weight (which may be 0).
            loss_value = loss.detach().item()
            if is_prior:
                # calculate loss of class(prior)
                loss = loss * args.prior_loss_weight
            loss.backward()
            if is_prior:
                print(f"==loss - image index {instance_idx}, loss: {loss_value}, prior")
            else:
                print(f"==loss - image index {instance_idx}, loss: {loss_value}, instance")

        params_to_clip = (
            itertools.chain(unet.parameters(), text_encoder.parameters())
            if args.train_text_encoder
            else unet.parameters()
        )
        torch.nn.utils.clip_grad_norm_(params_to_clip, 1.0, error_if_nonfinite=True)
        optimizer.step()
        optimizer.zero_grad()

    return [unet, text_encoder]
688
+
689
+
690
+
691
def pgd_attack(
    args,
    accelerator,
    models,
    tokenizer,
    noise_scheduler: DDIMScheduler,
    vae: AutoencoderKL,
    data_tensor: torch.Tensor,
    original_images: torch.Tensor,
    target_tensor: torch.Tensor,
    weight_dtype=torch.bfloat16,
):
    """Return new perturbed data.

    Each image in ``data_tensor`` is attacked independently for
    ``args.max_adv_train_steps`` steps. Every step computes the diffusion
    loss on the VAE latents, takes its gradient w.r.t. the latents, and
    pushes that gradient back through the VAE encoder onto the image. Every
    ``args.gradient_accumulation_steps`` steps the image moves by
    ``args.pgd_alpha`` along the sign of the accumulated gradient and is
    constrained according to ``args.constraint`` (``'eps'`` or ``'lpips'``).

    Args:
        models: ``(unet, text_encoder)``; both are frozen here.
        data_tensor: images to perturb; cloned, the input is not modified.
        original_images: clean images the perturbation is measured against.
        target_tensor: optional target latent for the target-shift losses.

    Returns:
        Tensor stacking the perturbed images in input order.

    Raises:
        NotImplementedError: for an unknown ``args.constraint``.
        ValueError: for an unknown scheduler prediction type.
    """
    num_steps = args.max_adv_train_steps

    unet, text_encoder = models
    device = accelerator.device
    if args.constraint == 'lpips':
        lpips_vgg = lpips.LPIPS(net='vgg')

    vae.to(device, dtype=weight_dtype)
    text_encoder.to(device, dtype=weight_dtype)
    unet.to(device, dtype=weight_dtype)
    if args.low_vram_mode:
        unet.set_use_memory_efficient_attention_xformers(True)
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
    unet.requires_grad_(False)
    data_tensor = data_tensor.detach().clone()

    # The prompt is the same for every image, so tokenize it once.
    input_ids = tokenizer(
        args.instance_prompt,
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        return_tensors="pt",
    ).input_ids.to(device)

    image_list = []
    for image_idx in tqdm(range(len(data_tensor)), desc="PGD attack"):
        perturbed_image = data_tensor[image_idx, :].unsqueeze(0)
        perturbed_image.requires_grad = True
        original_image = original_images[image_idx, :].unsqueeze(0)

        for step in range(num_steps):
            # First pass: encode without tracking the VAE graph, so only the
            # UNet/text-encoder graph is alive while the loss is computed.
            perturbed_image.requires_grad = False
            with torch.no_grad():
                latents = vae.encode(perturbed_image.to(device, dtype=weight_dtype)).latent_dist.mean
            latents = latents.detach().clone()
            latents.requires_grad = True
            latents = latents * vae.config.scaling_factor

            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            bsz = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
            timesteps = timesteps.long()

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(input_ids)[0]

            # Predict the noise residual
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

            # Get the target for loss depending on the prediction type
            if noise_scheduler.config.prediction_type == "epsilon":
                target = noise
            elif noise_scheduler.config.prediction_type == "v_prediction":
                target = noise_scheduler.get_velocity(latents, noise, timesteps)
            else:
                raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

            unet.zero_grad()
            text_encoder.zero_grad()
            loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

            # target-shift loss
            if target_tensor is not None:
                if args.mode != 'anti-db':
                    loss = - F.mse_loss(model_pred, target_tensor)
                    # fused mode
                    # NOTE(review): the weight is hard-coded to 1e2 although
                    # args.fused_weight exists; confirm which one is intended.
                    if args.mode == 'fused':
                        latent_attack = LatentAttack()
                        loss = loss - 1e2 * latent_attack(latents, target_tensor=target_tensor)

            loss = loss / args.gradient_accumulation_steps
            grads = autograd.grad(loss, latents)[0].detach().clone()
            # now loss is backproped to latents;
            # do forward on vae again, this time tracking the graph, and push
            # the latent gradient through the encoder onto the image
            perturbed_image.requires_grad = True
            gc_latents = vae.encode(perturbed_image.to(device, dtype=weight_dtype)).latent_dist.mean
            gc_latents.backward(gradient=grads)

            if step % args.gradient_accumulation_steps == args.gradient_accumulation_steps - 1:

                if args.constraint == 'eps':
                    alpha = args.pgd_alpha
                    adv_images = perturbed_image + alpha * perturbed_image.grad.sign()

                    # hard constraint
                    eps = args.pgd_eps
                    eta = torch.clamp(adv_images - original_image, min=-eps, max=+eps)
                    perturbed_image = torch.clamp(original_image + eta, min=-1, max=+1).detach_()
                    perturbed_image.requires_grad = True
                elif args.constraint == 'lpips':
                    # compute reg loss: hinge on the LPIPS distance above the bound.
                    # torch.max(x, 0) reduces over dim 0 rather than clamping at
                    # zero, so the hinge is expressed with torch.clamp.
                    lpips_distance = lpips_vgg(perturbed_image, original_image)
                    reg_loss = args.lpips_weight * torch.clamp(
                        lpips_distance - args.lpips_bound, min=0
                    ).squeeze()
                    # NOTE(review): this gradient is added to the attack gradient
                    # and the step below ascends along it, which increases the
                    # LPIPS distance for a positive lpips_weight - confirm the
                    # intended sign.
                    reg_loss.backward()

                    alpha = args.pgd_alpha
                    adv_images = perturbed_image + alpha * perturbed_image.grad.sign()

                    eta = adv_images - original_image
                    perturbed_image = torch.clamp(original_image + eta, min=-1, max=+1).detach_()
                    perturbed_image.requires_grad = True
                else:
                    raise NotImplementedError

        image_list.append(perturbed_image.detach().clone().squeeze(0))

    outputs = torch.stack(image_list)
    return outputs
827
+
828
def main(args):
    """Run the Mist pipeline and write the perturbed images to ``args.output_dir``.

    Steps: check the GPU (GPU mode only), build the accelerator, optionally
    generate class images for prior preservation, load the models, load the
    input images, then alternate ``pgd_attack`` and ``train_one_epoch`` for
    ``args.max_train_steps`` rounds and save the result after the last round.

    Raises:
        NotImplementedError: in GPU mode when no GPU can be queried or when
            less than 5.5 GiB of GPU memory is free.
    """
    if args.cuda:
        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        except Exception as err:
            raise NotImplementedError("No GPU found in GPU mode. Please try CPU mode.") from err
        # Checked outside the try block so this message is not replaced by
        # the "No GPU found" one.
        mem_free = mem_info.free / float(1073741824)
        if mem_free < 5.5:
            raise NotImplementedError("Your GPU memory is not enough for running Mist on GPU. Please try CPU mode.")

    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator_kwargs = {
        "mixed_precision": args.mixed_precision,
        "log_with": args.report_to,
        "project_dir": logging_dir,
    }
    if not args.cuda:
        accelerator_kwargs["cpu"] = True
    accelerator = Accelerator(**accelerator_kwargs)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_warning()
        diffusers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
        diffusers.utils.logging.set_verbosity_error()

    if args.seed is not None:
        set_seed(args.seed)

    # Reduced precision is only used on GPU.
    weight_dtype = torch.float32
    if args.cuda:
        if accelerator.mixed_precision == "fp16":
            weight_dtype = torch.float16
        elif accelerator.mixed_precision == "bf16":
            weight_dtype = torch.bfloat16
    print("==precision: {}==".format(weight_dtype))

    # Generate class images if prior preservation is enabled.
    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
            if args.mixed_precision == "fp32":
                torch_dtype = torch.float32
            elif args.mixed_precision == "fp16":
                torch_dtype = torch.float16
            elif args.mixed_precision == "bf16":
                torch_dtype = torch.bfloat16
            pipeline = DiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                safety_checker=None,
                revision=args.revision,
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(sample_dataset, batch_size=args.sample_batch_size)

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            for example in tqdm(
                sample_dataloader,
                desc="Generating class images",
                disable=not accelerator.is_local_main_process,
            ):
                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                    image_filename = class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                    image.save(image_filename)

            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # import correct text encoder class
    text_encoder_cls = import_model_class_from_model_name_or_path(args.pretrained_model_name_or_path, args.revision)

    # Load scheduler and models
    text_encoder = text_encoder_cls.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=args.revision,
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
    )

    # add by lora
    unet.requires_grad_(False)
    # end: added by lora

    tokenizer = AutoTokenizer.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="tokenizer",
        revision=args.revision,
        use_fast=False,
    )

    noise_scheduler = DDIMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")

    # The VAE is placed on the accelerator's device for both CPU and GPU mode;
    # the former `.cuda()` call ran only in CPU mode, where CUDA may be absent.
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision
    )
    vae.to(accelerator.device, dtype=weight_dtype)
    vae.requires_grad_(False)
    # Flags set for the encoder's gradient-checkpointing path, which the PGD
    # step back-propagates through.
    vae.encoder.training = True
    vae.encoder.gradient_checkpointing = True

    if not args.train_text_encoder:
        text_encoder.requires_grad_(False)

    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    perturbed_data, prompts, data_sizes = load_data(
        args.instance_data_dir,
        size=args.resolution,
        center_crop=args.center_crop,
    )
    original_data = perturbed_data.clone()
    original_data.requires_grad_(False)

    target_latent_tensor = None
    if args.target_image_path is not None and args.target_image_path != "":
        target_image_path = Path(args.target_image_path)
        assert target_image_path.is_file(), f"Target image path {target_image_path} does not exist"

        target_image = Image.open(target_image_path).convert("RGB").resize((args.resolution, args.resolution))
        target_image = np.array(target_image)[None].transpose(0, 3, 1, 2)
        # Scale uint8 pixels to [-1, 1] on the same device as the VAE.
        target_image_tensor = (
            torch.from_numpy(target_image).to(accelerator.device, dtype=weight_dtype) / 127.5 - 1.0
        )
        target_latent_tensor = (
            vae.encode(target_image_tensor).latent_dist.sample().to(dtype=weight_dtype) * vae.config.scaling_factor
        )
        target_image_tensor = target_image_tensor.to('cpu')
        del target_image_tensor

    f = [unet, text_encoder]
    for i in range(args.max_train_steps):
        # Attack a throw-away copy so pgd_attack's freezing/moving of the
        # models does not leak into the models that are trained afterwards.
        f_sur = copy.deepcopy(f)
        perturbed_data = pgd_attack(
            args,
            accelerator,
            f_sur,
            tokenizer,
            noise_scheduler,
            vae,
            perturbed_data,
            original_data,
            target_latent_tensor,
            weight_dtype,
        )
        del f_sur
        if args.cuda:
            gc.collect()
        f = train_one_epoch(
            args,
            accelerator,
            f,
            tokenizer,
            noise_scheduler,
            vae,
            perturbed_data,
            prompts,
            weight_dtype,
        )

        for model in f:
            if model is not None:
                model.to('cpu')

        if args.cuda:
            gc.collect()
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            print("=======Epoch {} ends! Memory cost: {}======".format(i, mem_info.used / float(1073741824)))
        else:
            print("=======Epoch {} ends!======".format(i))

        # True only on the final round: results are saved once, at the end.
        if (i + 1) % args.max_train_steps == 0:
            save_folder = f"{args.output_dir}"
            os.makedirs(save_folder, exist_ok=True)
            noised_imgs = perturbed_data.detach().cpu()
            origin_imgs = original_data.detach().cpu()

            def to_uint8_hwc(pixels):
                """Map a [-1, 1] CHW tensor to a uint8 HWC array."""
                return (pixels * 127.5 + 128).clamp(0, 255).to(torch.uint8).permute(1, 2, 0).numpy()

            # Re-list the input folder with the same filter load_data uses so
            # names line up with the loaded tensors.
            img_names = [
                str(filename)
                for filename in os.listdir(args.instance_data_dir)
                if filename.endswith(".png") or filename.endswith(".jpg")
            ]
            for img_pixel, ori_img_pixel, img_name, img_size in zip(noised_imgs, origin_imgs, img_names, data_sizes):
                save_path = os.path.join(save_folder, f"{i+1}_noise_{img_name}")
                if not args.resize:
                    Image.fromarray(to_uint8_hwc(img_pixel)).save(save_path)
                else:
                    # Resize only the perturbation back to the original size
                    # and add it to the untouched original image.
                    ori_img_path = os.path.join(args.instance_data_dir, img_name)
                    ori_img = np.array(Image.open(ori_img_path).convert("RGB"))

                    ori_img_resized = np.array(
                        Image.fromarray(to_uint8_hwc(ori_img_pixel)).resize(img_size), dtype=np.int32
                    )
                    perturbed_img_resized = np.array(
                        Image.fromarray(to_uint8_hwc(img_pixel)).resize(img_size), dtype=np.int32
                    )

                    perturbation = perturbed_img_resized - ori_img_resized
                    assert perturbation.shape == ori_img.shape

                    perturbed_img = (ori_img + perturbation).clip(0, 255).astype(np.uint8)
                    Image.fromarray(perturbed_img).save(save_path)

                print(f"==Saved misted image to {save_path}, size: {img_size}==")
            del noised_imgs
1086
+
1087
def update_args_with_config(args, config):
    '''
    Build the run arguments from the parser defaults plus a user config.

    NOTE: the ``args`` argument is ignored - a fresh namespace is obtained
    from ``parse_args()`` and returned. The parameter is kept so existing
    callers keep working.

    ``config`` is a 21-item sequence, in this order:
        eps, device, mode, resize, data_path, output_path, model_path,
        class_path, prompt, class_prompt, max_train_steps,
        max_f_train_steps, max_adv_train_steps, lora_lr, pgd_lr, rank,
        prior_loss_weight, fused_weight, constraint_mode, lpips_bound,
        lpips_weight

    Notable mappings:
        eps:    divided by 255 and stored as ``pgd_eps``
        device: 'cpu' disables ``cuda`` and ``low_vram_mode``; anything else
                enables both
        mode:   'Mode 1' -> 'lunet', 'Mode 2' -> 'fused', 'Mode 3' -> 'anti-db'
                (any other value leaves the parser default)
        constraint_mode: 'LPIPS' -> 'lpips', anything else -> 'eps'
    '''

    args = parse_args()
    eps, device, mode, resize, data_path, output_path, model_path, class_path, prompt, \
        class_prompt, max_train_steps, max_f_train_steps, max_adv_train_steps, lora_lr, pgd_lr, \
        rank, prior_loss_weight, fused_weight, constraint_mode, lpips_bound, lpips_weight = config
    args.pgd_eps = float(eps)/255.0
    if device == 'cpu':
        args.cuda, args.low_vram_mode = False, False
    else:
        args.cuda, args.low_vram_mode = True, True
    if mode == 'Mode 1':
        args.mode = 'lunet'
    elif mode == 'Mode 2':
        args.mode = 'fused'
    elif mode == 'Mode 3':
        args.mode = 'anti-db'
    if resize:
        args.resize = True

    assert os.path.exists(data_path) and os.path.exists(output_path)
    args.instance_data_dir = data_path
    args.output_dir = output_path
    args.pretrained_model_name_or_path = model_path
    args.class_data_dir = class_path
    args.instance_prompt = prompt

    args.class_prompt = class_prompt
    args.max_train_steps = max_train_steps
    args.max_f_train_steps = max_f_train_steps
    args.max_adv_train_steps = max_adv_train_steps
    args.learning_rate = lora_lr
    args.pgd_alpha = pgd_lr
    args.rank = rank
    # train_one_epoch reads the LoRA rank from `lora_rank`, so the configured
    # value must be stored there as well to take effect.
    args.lora_rank = rank
    args.prior_loss_weight = prior_loss_weight
    args.fused_weight = fused_weight

    if constraint_mode == 'LPIPS':
        args.constraint = 'lpips'
    else:
        args.constraint = 'eps'
    args.lpips_bound = lpips_bound
    args.lpips_weight = lpips_weight

    return args
1151
+
1152
+
1153
# Script entry point: parse the command line and run the pipeline.
if __name__ == "__main__":
    main(parse_args())
1156
+
attacks/utils.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+
4
# Content prompts; each ends with a comma so further text can be appended.
prompt_dataset = [
    "Portrait of an astronaut in space, detailed starry background, reflective helmet,",
    "Painting of a floating island with giant clock gears, populated with mythical creatures,",
    "Landscape of a Japanese garden in autumn, with a bridge over a koi pond,",
    "Painting representing the sound of jazz music, using vibrant colors and erratic shapes,",
    "Painting of a modern smartphone with classic art pieces appearing on the screen,",
    "Battle scene with futuristic robots and a golden palace in the background,",
    "Scene of a bustling city market with different perspectives of people and stalls,",
    "Scene of a ship sailing in a stormy sea, with dramatic lighting and powerful waves,",
    "Portrait of a female botanist surrounded by exotic plants in a greenhouse,",
    "Painting of an ancient castle at night, with a full moon, gargoyles, and shadows,",
]

# Art-style names.
style_dataset = [
    "Art Nouveau",
    "Romantic",
    "Cubist",
    "Baroque",
    "Pop Art",
    "Abstract",
    "Impressionist",
    "Surrealist",
    "Renaissance",
    "Pointillism",
]
29
+
30
+
31
+
32
class attack_mixin:
    """Base class for attack objectives; subclasses implement ``__call__``."""

    def __call__(
        self,
        latents: torch.Tensor,
        timesteps: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        unet: torch.nn.Module,
        target_tensor: torch.Tensor,
        noise_scheduler
    ):
        raise NotImplementedError


class AdvDM(attack_mixin):
    """
    This attack aims to maximize the training loss of diffusion model
    """
    def __call__(
        self,
        latents: torch.Tensor,
        noise: torch.Tensor,
        timesteps: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        unet: torch.nn.Module,
        text_encoder: torch.nn.Module,
        input_ids,
        target_tensor: torch.Tensor,
        noise_scheduler
    ):
        """Return the diffusion training loss, optionally shifted by a target.

        ``encoder_hidden_states`` is accepted but ignored: the embedding is
        recomputed from ``input_ids`` with ``text_encoder``. When
        ``target_tensor`` is given, the MSE between the scheduler's
        one-step-denoised prediction and the target noised to the previous
        timestep is subtracted from the loss.

        Raises:
            ValueError: for an unknown scheduler prediction type.
        """
        noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

        # Get the text embedding for conditioning
        encoder_hidden_states = text_encoder(input_ids)[0]

        # Predict the noise residual
        model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

        # Get the target for loss depending on the prediction type
        if noise_scheduler.config.prediction_type == "epsilon":
            target = noise
        elif noise_scheduler.config.prediction_type == "v_prediction":
            target = noise_scheduler.get_velocity(latents, noise, timesteps)
        else:
            raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

        unet.zero_grad()
        text_encoder.zero_grad()
        loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

        # target-shift loss
        if target_tensor is not None:
            # The scheduler is stepped one sample at a time.
            xtm1_pred = torch.cat(
                [
                    noise_scheduler.step(
                        model_pred[idx : idx + 1],
                        timesteps[idx : idx + 1],
                        noisy_latents[idx : idx + 1],
                    ).prev_sample
                    for idx in range(len(model_pred))
                ]
            )
            # Clamp so timestep 0 does not become -1, which would index the
            # scheduler's tables from the end (the noisiest step).
            prev_timesteps = torch.clamp(timesteps - 1, min=0)
            xtm1_target = noise_scheduler.add_noise(target_tensor, noise, prev_timesteps)
            loss = loss - F.mse_loss(xtm1_pred, xtm1_target)

        return loss


class LatentAttack(attack_mixin):
    """
    This attack aims to minimize the l2 distance between latent and target_tensor
    """
    def __call__(
        self,
        latents: torch.Tensor,
        timesteps: torch.Tensor=None,
        encoder_hidden_states: torch.Tensor=None,
        unet: torch.nn.Module=None,
        target_tensor: torch.Tensor=None,
        noise_scheduler=None
    ):
        """Return the negated mean-squared error between ``latents`` and ``target_tensor``.

        Only ``latents`` and ``target_tensor`` are used; the other parameters
        exist to match the base-class signature.

        Raises:
            ValueError: if ``target_tensor`` is ``None``.
        """
        # Identity check: `== None` on a tensor is an element-wise-style
        # comparison, not a None test.
        if target_tensor is None:
            raise ValueError("Need a target tensor for pre-attack")
        loss = - F.mse_loss(latents, target_tensor, reduction="mean")
        return loss
data/MIST.png ADDED
eval/sample_lora_15.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
eval/train_dreambooth_lora_15.py ADDED
@@ -0,0 +1,1007 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bootstrapped from:
2
+ # https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py
3
+
4
+ import argparse
5
+ import hashlib
6
+ import itertools
7
+ import math
8
+ import os
9
+ import inspect
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ import torch
14
+ import torch.nn.functional as F
15
+ import torch.utils.checkpoint
16
+ import os
17
+ import sys
18
+ sys.path.insert(0, sys.path[0]+"/../")
19
+ from accelerate import Accelerator
20
+ from accelerate.logging import get_logger
21
+ from accelerate.utils import set_seed
22
+ from diffusers import (
23
+ AutoencoderKL,
24
+ DDPMScheduler,
25
+ StableDiffusionPipeline,
26
+ UNet2DConditionModel,
27
+ )
28
+ from diffusers.optimization import get_scheduler
29
+ from huggingface_hub import HfFolder, Repository, whoami
30
+
31
+ from tqdm.auto import tqdm
32
+ from transformers import CLIPTextModel, CLIPTokenizer
33
+
34
+ from lora_diffusion import (
35
+ extract_lora_ups_down,
36
+ inject_trainable_lora,
37
+ safetensors_available,
38
+ save_lora_weight,
39
+ save_safeloras,
40
+ )
41
+ from lora_diffusion.xformers_utils import set_use_memory_efficient_attention_xformers
42
+ from PIL import Image
43
+ from torch.utils.data import Dataset
44
+ from torchvision import transforms
45
+
46
+ from pathlib import Path
47
+
48
+ import random
49
+ import re
50
+
51
+
52
class DreamBoothDataset(Dataset):
    """
    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
    It pre-processes the images and tokenizes the prompts.

    Instance images are the ``.png`` / ``.jpg`` files found directly inside
    ``instance_data_root``. When ``class_data_root`` is given, every entry of
    that directory is treated as a class (prior-preservation) image and each
    example additionally carries ``class_images`` / ``class_prompt_ids``.

    Raises:
        ValueError: if ``instance_data_root`` does not exist or contains no
            ``.png`` / ``.jpg`` file.
    """

    def __init__(
        self,
        instance_data_root,
        instance_prompt,
        tokenizer,
        class_data_root=None,
        class_prompt=None,
        size=512,
        center_crop=False,
        color_jitter=False,
        h_flip=False,
        resize=False,
    ):
        self.size = size
        self.center_crop = center_crop
        self.tokenizer = tokenizer
        self.resize = resize

        self.instance_data_root = Path(instance_data_root)
        if not self.instance_data_root.exists():
            raise ValueError("Instance images root doesn't exists.")

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # index -> image mapping stable across runs and machines so that a
        # fixed seed really reproduces the same training sequence.
        self.instance_images_path = sorted(
            os.path.join(instance_data_root, filename)
            for filename in os.listdir(instance_data_root)
            if filename.endswith((".png", ".jpg"))
        )
        self.num_instance_images = len(self.instance_images_path)
        if self.num_instance_images == 0:
            # Fail here rather than with a ZeroDivisionError from the
            # ``index % self.num_instance_images`` in __getitem__.
            raise ValueError(
                f"No .png or .jpg instance images found in {instance_data_root}."
            )
        self.instance_prompt = instance_prompt
        self._length = self.num_instance_images

        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
            self.class_images_path = list(self.class_data_root.iterdir())
            self.num_class_images = len(self.class_images_path)
            # Iterate over the larger of the two sets; the smaller one wraps
            # around through the modulo in __getitem__.
            self._length = max(self.num_class_images, self.num_instance_images)
            self.class_prompt = class_prompt
        else:
            self.class_data_root = None

        img_transforms = []

        if resize:
            img_transforms.append(
                transforms.Resize(
                    size, interpolation=transforms.InterpolationMode.BILINEAR
                )
            )
        if center_crop:
            img_transforms.append(transforms.CenterCrop(size))
        if color_jitter:
            img_transforms.append(transforms.ColorJitter(0.2, 0.1))
        if h_flip:
            img_transforms.append(transforms.RandomHorizontalFlip())

        # Optional augmentations first, then tensor conversion and
        # normalisation with mean 0.5 / std 0.5.
        self.image_transforms = transforms.Compose(
            [*img_transforms, transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
        )

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        """Return a dict with the transformed image(s) and un-padded prompt token ids."""
        example = {}
        instance_image = Image.open(
            self.instance_images_path[index % self.num_instance_images]
        )
        if instance_image.mode != "RGB":
            instance_image = instance_image.convert("RGB")
        example["instance_images"] = self.image_transforms(instance_image)
        # Padding is deferred to the collate function, hence "do_not_pad".
        example["instance_prompt_ids"] = self.tokenizer(
            self.instance_prompt,
            padding="do_not_pad",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        ).input_ids

        if self.class_data_root:
            class_image = Image.open(
                self.class_images_path[index % self.num_class_images]
            )
            if class_image.mode != "RGB":
                class_image = class_image.convert("RGB")
            example["class_images"] = self.image_transforms(class_image)
            example["class_prompt_ids"] = self.tokenizer(
                self.class_prompt,
                padding="do_not_pad",
                truncation=True,
                max_length=self.tokenizer.model_max_length,
            ).input_ids

        return example
150
+
151
+
152
class PromptDataset(Dataset):
    """Dataset that serves one fixed prompt ``num_samples`` times.

    Each item pairs the prompt with its own index, so class images can be
    generated on multiple GPUs.
    """

    def __init__(self, prompt, num_samples):
        self.num_samples = num_samples
        self.prompt = prompt

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        # Same prompt for every item; only the index differs.
        return {"prompt": self.prompt, "index": index}
167
+
168
+
169
# Module-level logger created through accelerate's get_logger helper.
logger = get_logger(__name__)
170
+
171
+
172
def _str2bool(value):
    """Convert a command-line token such as "true", "0" or "no" into a real bool."""
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string used to be the only way to switch these options off,
    # so it is still accepted as False.
    if lowered in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}.")


def parse_args(input_args=None):
    """Build the command-line parser, parse the arguments and validate them.

    Args:
        input_args: optional list of argument strings; when ``None`` the
            arguments are taken from ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``. ``local_rank`` is overridden by the
        ``LOCAL_RANK`` environment variable when that is set, and
        ``output_format`` is downgraded from "both" to "pt" when safetensors
        is unavailable.

    Raises:
        ValueError: if prior preservation is enabled without a class data
            directory or class prompt, or if "safe" output is requested while
            safetensors is unavailable.
    """
    import warnings

    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default="stable-diffusion/stable-diffusion-1-5",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--pretrained_vae_name_or_path",
        type=str,
        default=None,
        help="Path to pretrained vae or vae identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default="outputs/celeba-20-121/noise-ckpt/5",
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default="data/celeba-20-121",
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default="a photo of sks person",
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default="a photo of person",
        help="The prompt to specify images in the same class as provided instance images.",
    )
    # Boolean options that default to True take an optional value
    # ("--flag", "--flag true", "--flag false"). Without a real converter the
    # string "False" would be stored and evaluate as truthy.
    parser.add_argument(
        "--with_prior_preservation",
        type=_str2bool,
        nargs="?",
        const=True,
        default=True,
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument(
        "--prior_loss_weight",
        type=float,
        default=1.0,
        help="The weight of prior preservation loss.",
    )
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior preservation loss. If not have enough images, additional images will be"
            " sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="lora_repo/model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--output_format",
        type=str,
        choices=["pt", "safe", "both"],
        default="both",
        help="The output format of the model predicitions and checkpoints.",
    )
    parser.add_argument(
        "--seed", type=int, default=None, help="A seed for reproducible training."
    )
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop",
        type=_str2bool,
        nargs="?",
        const=True,
        default=True,
        help="Whether to center crop images before resizing to resolution",
    )
    parser.add_argument(
        "--color_jitter",
        action="store_true",
        help="Whether to apply color jitter to images",
    )
    parser.add_argument(
        "--train_text_encoder",
        type=_str2bool,
        nargs="?",
        const=True,
        default=True,
        help="Whether to train the text encoder",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=1,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--sample_batch_size",
        type=int,
        default=4,
        help="Batch size (per device) for sampling images.",
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=1000,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=1000,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--lora_rank",
        type=int,
        default=4,
        help="Rank of LoRA approximation.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=1e-4,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--learning_rate_text",
        type=float,
        default=5e-5,
        help="Initial learning rate for text encoder (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps",
        type=int,
        default=500,
        help="Number of steps for the warmup in the lr scheduler.",
    )
    parser.add_argument(
        "--use_8bit_adam",
        action="store_true",
        help="Whether or not to use 8-bit Adam from bitsandbytes.",
    )
    parser.add_argument(
        "--adam_beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_beta2",
        type=float,
        default=0.999,
        help="The beta2 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
    )
    parser.add_argument(
        "--adam_epsilon",
        type=float,
        default=1e-08,
        help="Epsilon value for the Adam optimizer",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the model to the Hub.",
    )
    parser.add_argument(
        "--hub_token",
        type=str,
        default=None,
        help="The token to use to push to the Model Hub.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--resume_unet",
        type=str,
        default=None,
        help=("File path for unet lora to resume training."),
    )
    parser.add_argument(
        "--resume_text_encoder",
        type=str,
        default=None,
        help=("File path for text encoder lora to resume training."),
    )
    # type=bool would turn every non-empty string (including "False") into
    # True, so the explicit converter is used here as well.
    parser.add_argument(
        "--resize",
        type=_str2bool,
        nargs="?",
        const=True,
        default=True,
        required=False,
        help="Should images be resized to --resolution before training?",
    )
    parser.add_argument(
        "--use_xformers", action="store_true", help="Whether or not to use xformers"
    )

    if input_args is not None:
        args = parser.parse_args(input_args)
    else:
        args = parser.parse_args()

    # A launcher-provided LOCAL_RANK wins over the command-line value.
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")
    else:
        # warnings.warn is used instead of the accelerate logger because no
        # Accelerator has been created yet at argument-parsing time.
        if args.class_data_dir is not None:
            warnings.warn(
                "You need not use --class_data_dir without --with_prior_preservation."
            )
        if args.class_prompt is not None:
            warnings.warn(
                "You need not use --class_prompt without --with_prior_preservation."
            )

    if not safetensors_available:
        if args.output_format == "both":
            print(
                "Safetensors is not available - changing output format to just output PyTorch files"
            )
            args.output_format = "pt"
        elif args.output_format == "safe":
            raise ValueError(
                "Safetensors is not available - either install it, or change output_format."
            )

    return args
483
+
484
+
485
def main(args):
    """Run LoRA DreamBooth fine-tuning of a Stable Diffusion model.

    Steps, in order:
      1. Optionally generate missing class (prior-preservation) images.
      2. Load tokenizer, text encoder, VAE and UNet and inject trainable LoRA
         layers into the UNet (and into the text encoder when
         ``args.train_text_encoder`` is set).
      3. Train with the MSE loss on the scheduler's prediction target, adding
         the weighted prior loss when prior preservation is enabled.
      4. Save LoRA weights every ``args.save_steps`` steps and once more at
         the end, in the format(s) selected by ``args.output_format``.

    Args:
        args: the namespace produced by ``parse_args``.

    Raises:
        ValueError: if ``--push_to_hub`` is requested (not supported here), if
            gradient accumulation is combined with text-encoder training in a
            multi-process run, or if the scheduler reports an unknown
            prediction type.
        ImportError: if ``--use_8bit_adam`` is set but bitsandbytes is missing.
    """
    # This script never creates a Hub repository object, so a push at the end
    # of training could only fail with a NameError. Reject the option before
    # any expensive work is done.
    if args.push_to_hub:
        raise ValueError(
            "--push_to_hub is not supported by this script: no Hub repository is created. "
            "Upload the files written to --output_dir manually instead."
        )

    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with="tensorboard",
        project_dir=logging_dir,
    )

    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
    if (
        args.train_text_encoder
        and args.gradient_accumulation_steps > 1
        and accelerator.num_processes > 1
    ):
        raise ValueError(
            "Gradient accumulation is not supported when training the text encoder in distributed training. "
            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
        )

    if args.seed is not None:
        set_seed(args.seed)

    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        # Top up the class image folder by sampling from the base model.
        if cur_class_images < args.num_class_images:
            torch_dtype = (
                torch.float16 if accelerator.device.type == "cuda" else torch.float32
            )
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                safety_checker=None,
                revision=args.revision,
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(
                sample_dataset, batch_size=args.sample_batch_size
            )

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            for example in tqdm(
                sample_dataloader,
                desc="Generating class images",
                disable=not accelerator.is_local_main_process,
            ):
                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    # The content hash keeps file names unique across runs.
                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                    image_filename = (
                        class_images_dir
                        / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                    )
                    image.save(image_filename)

            # Free the sampling pipeline before the training models are loaded.
            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Create the output directory (main process only).
    if accelerator.is_main_process:

        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.tokenizer_name,
            revision=args.revision,
        )
    elif args.pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
        )

    # Load models and create wrapper for stable diffusion
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=args.revision,
    )
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,
        subfolder=None if args.pretrained_vae_name_or_path else "vae",
        revision=None if args.pretrained_vae_name_or_path else args.revision,
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet",
        revision=args.revision,
    )
    # Freeze the base UNet; only the injected LoRA parameters are trained.
    unet.requires_grad_(False)
    unet_lora_params, _ = inject_trainable_lora(
        unet, r=args.lora_rank, loras=args.resume_unet
    )

    for _up, _down in extract_lora_ups_down(unet):
        print("Before training: Unet First Layer lora up", _up.weight.data)
        print("Before training: Unet First Layer lora down", _down.weight.data)
        break

    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    if args.train_text_encoder:
        text_encoder_lora_params, _ = inject_trainable_lora(
            text_encoder,
            target_replace_module=["CLIPAttention"],
            r=args.lora_rank,
        )
        for _up, _down in extract_lora_ups_down(
            text_encoder, target_replace_module=["CLIPAttention"]
        ):
            print("Before training: text encoder First Layer lora up", _up.weight.data)
            print(
                "Before training: text encoder First Layer lora down", _down.weight.data
            )
            break

    if args.use_xformers:
        set_use_memory_efficient_attention_xformers(unet, True)
        set_use_memory_efficient_attention_xformers(vae, True)

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()
        if args.train_text_encoder:
            text_encoder.gradient_checkpointing_enable()

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate
            * args.gradient_accumulation_steps
            * args.train_batch_size
            * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    # The text encoder falls back to the UNet learning rate when no separate
    # rate is given.
    text_lr = (
        args.learning_rate
        if args.learning_rate_text is None
        else args.learning_rate_text
    )

    params_to_optimize = (
        [
            {"params": itertools.chain(*unet_lora_params), "lr": args.learning_rate},
            {
                "params": itertools.chain(*text_encoder_lora_params),
                "lr": text_lr,
            },
        ]
        if args.train_text_encoder
        else itertools.chain(*unet_lora_params)
    )
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    noise_scheduler = DDPMScheduler.from_config(
        args.pretrained_model_name_or_path, subfolder="scheduler"
    )

    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
        class_prompt=args.class_prompt,
        tokenizer=tokenizer,
        size=args.resolution,
        center_crop=args.center_crop,
        color_jitter=args.color_jitter,
        resize=args.resize,
    )

    def collate_fn(examples):
        """Stack images and pad token ids for one batch."""
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]

        # Concat class and instance examples for prior preservation.
        # We do this to avoid doing two forward passes.
        if args.with_prior_preservation:
            input_ids += [example["class_prompt_ids"] for example in examples]
            pixel_values += [example["class_images"] for example in examples]

        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad(
            {"input_ids": input_ids},
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }
        return batch

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=0,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps
    )
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

    if args.train_text_encoder:
        (
            unet,
            text_encoder,
            optimizer,
            train_dataloader,
            lr_scheduler,
        ) = accelerator.prepare(
            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
        )
    else:
        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, optimizer, train_dataloader, lr_scheduler
        )

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move text_encode and vae to gpu.
    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    vae.to(accelerator.device, dtype=weight_dtype)
    if not args.train_text_encoder:
        text_encoder.to(accelerator.device, dtype=weight_dtype)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps
    )
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initializes automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers("dreambooth", config=vars(args))

    # Train!
    total_batch_size = (
        args.train_batch_size
        * accelerator.num_processes
        * args.gradient_accumulation_steps
    )

    print("***** Running training *****")
    print(f" Num examples = {len(train_dataset)}")
    print(f" Num batches each epoch = {len(train_dataloader)}")
    print(f" Num Epochs = {args.num_train_epochs}")
    print(f" Instantaneous batch size per device = {args.train_batch_size}")
    print(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    print(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    print(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(
        range(args.max_train_steps), disable=not accelerator.is_local_main_process
    )
    progress_bar.set_description("Steps")
    global_step = 0
    last_save = 0

    for epoch in range(args.num_train_epochs):
        unet.train()
        if args.train_text_encoder:
            text_encoder.train()

        for step, batch in enumerate(train_dataloader):
            # Convert images to latent space
            latents = vae.encode(
                batch["pixel_values"].to(dtype=weight_dtype)
            ).latent_dist.sample()
            latents = latents * 0.18215

            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            bsz = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (bsz,),
                device=latents.device,
            )
            timesteps = timesteps.long()

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(batch["input_ids"])[0]

            # Predict the noise residual
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

            # Get the target for loss depending on the prediction type
            if noise_scheduler.config.prediction_type == "epsilon":
                target = noise
            elif noise_scheduler.config.prediction_type == "v_prediction":
                target = noise_scheduler.get_velocity(latents, noise, timesteps)
            else:
                raise ValueError(
                    f"Unknown prediction type {noise_scheduler.config.prediction_type}"
                )

            if args.with_prior_preservation:
                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
                target, target_prior = torch.chunk(target, 2, dim=0)

                # Compute instance loss
                loss = (
                    F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    .mean([1, 2, 3])
                    .mean()
                )

                # Compute prior loss
                prior_loss = F.mse_loss(
                    model_pred_prior.float(), target_prior.float(), reduction="mean"
                )

                # Add the prior loss to the instance loss.
                loss = loss + args.prior_loss_weight * prior_loss
            else:
                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

            accelerator.backward(loss)
            if accelerator.sync_gradients:
                params_to_clip = (
                    itertools.chain(unet.parameters(), text_encoder.parameters())
                    if args.train_text_encoder
                    else unet.parameters()
                )
                accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
            optimizer.step()
            lr_scheduler.step()
            progress_bar.update(1)
            optimizer.zero_grad()

            global_step += 1

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                if args.save_steps and global_step - last_save >= args.save_steps:
                    if accelerator.is_main_process:
                        # newer versions of accelerate allow the 'keep_fp32_wrapper' arg. without passing
                        # it, the models will be unwrapped, and when they are then used for further training,
                        # we will crash. pass this, but only to newer versions of accelerate. fixes
                        # https://github.com/huggingface/diffusers/issues/1566
                        accepts_keep_fp32_wrapper = "keep_fp32_wrapper" in set(
                            inspect.signature(
                                accelerator.unwrap_model
                            ).parameters.keys()
                        )
                        extra_args = (
                            {"keep_fp32_wrapper": True}
                            if accepts_keep_fp32_wrapper
                            else {}
                        )
                        pipeline = StableDiffusionPipeline.from_pretrained(
                            args.pretrained_model_name_or_path,
                            unet=accelerator.unwrap_model(unet, **extra_args),
                            text_encoder=accelerator.unwrap_model(
                                text_encoder, **extra_args
                            ),
                            revision=args.revision,
                        )

                        filename_unet = (
                            f"{args.output_dir}/lora_weight_e{epoch}_s{global_step}.pt"
                        )
                        filename_text_encoder = f"{args.output_dir}/lora_weight_e{epoch}_s{global_step}.text_encoder.pt"
                        print(f"save weights {filename_unet}, {filename_text_encoder}")
                        save_lora_weight(pipeline.unet, filename_unet)
                        if args.train_text_encoder:
                            save_lora_weight(
                                pipeline.text_encoder,
                                filename_text_encoder,
                                target_replace_module=["CLIPAttention"],
                            )

                        for _up, _down in extract_lora_ups_down(pipeline.unet):
                            print(
                                "First Unet Layer's Up Weight is now : ",
                                _up.weight.data,
                            )
                            print(
                                "First Unet Layer's Down Weight is now : ",
                                _down.weight.data,
                            )
                            break
                        if args.train_text_encoder:
                            for _up, _down in extract_lora_ups_down(
                                pipeline.text_encoder,
                                target_replace_module=["CLIPAttention"],
                            ):
                                print(
                                    "First Text Encoder Layer's Up Weight is now : ",
                                    _up.weight.data,
                                )
                                print(
                                    "First Text Encoder Layer's Down Weight is now : ",
                                    _down.weight.data,
                                )
                                break

                        last_save = global_step

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

            if global_step >= args.max_train_steps:
                break

    accelerator.wait_for_everyone()

    # Create the pipeline using using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            unet=accelerator.unwrap_model(unet),
            text_encoder=accelerator.unwrap_model(text_encoder),
            revision=args.revision,
        )

        print("\n\nLora TRAINING DONE!\n\n")

        if args.output_format == "pt" or args.output_format == "both":
            save_lora_weight(pipeline.unet, args.output_dir + "/lora_weight.pt")
            if args.train_text_encoder:
                save_lora_weight(
                    pipeline.text_encoder,
                    args.output_dir + "/lora_weight.text_encoder.pt",
                    target_replace_module=["CLIPAttention"],
                )

        if args.output_format == "safe" or args.output_format == "both":
            loras = {}
            loras["unet"] = (pipeline.unet, {"CrossAttention", "Attention", "GEGLU"})
            if args.train_text_encoder:
                loras["text_encoder"] = (pipeline.text_encoder, {"CLIPAttention"})

            save_safeloras(loras, args.output_dir + "/lora_weight.safetensors")

    accelerator.end_training()
1003
+
1004
+
1005
# Script entry point: parse the command-line arguments and start training.
if __name__ == "__main__":
    args = parse_args()
    main(args)
ldm/configs/karlo/decoder_900M_vit_l.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ type: t2i-decoder
3
+ diffusion_sampler: uniform
4
+ hparams:
5
+ image_size: 64
6
+ num_channels: 320
7
+ num_res_blocks: 3
8
+ channel_mult: ''
9
+ attention_resolutions: 32,16,8
10
+ num_heads: -1
11
+ num_head_channels: 64
12
+ num_heads_upsample: -1
13
+ use_scale_shift_norm: true
14
+ dropout: 0.1
15
+ clip_dim: 768
16
+ clip_emb_mult: 4
17
+ text_ctx: 77
18
+ xf_width: 1536
19
+ xf_layers: 0
20
+ xf_heads: 0
21
+ xf_final_ln: false
22
+ resblock_updown: true
23
+ learn_sigma: true
24
+ text_drop: 0.3
25
+ clip_emb_type: image
26
+ clip_emb_drop: 0.1
27
+ use_plm: true
28
+
29
+ diffusion:
30
+ steps: 1000
31
+ learn_sigma: true
32
+ sigma_small: false
33
+ noise_schedule: squaredcos_cap_v2
34
+ use_kl: false
35
+ predict_xstart: false
36
+ rescale_learned_sigmas: true
37
+ timestep_respacing: ''
ldm/configs/karlo/improved_sr_64_256_1.4B.yaml ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ type: improved_sr_64_256
3
+ diffusion_sampler: uniform
4
+ hparams:
5
+ channels: 320
6
+ depth: 3
7
+ channels_multiple:
8
+ - 1
9
+ - 2
10
+ - 3
11
+ - 4
12
+ dropout: 0.0
13
+
14
+ diffusion:
15
+ steps: 1000
16
+ learn_sigma: false
17
+ sigma_small: true
18
+ noise_schedule: squaredcos_cap_v2
19
+ use_kl: false
20
+ predict_xstart: false
21
+ rescale_learned_sigmas: true
22
+ timestep_respacing: '7'
23
+
24
+
25
+ sampling:
26
+ timestep_respacing: '7' # fix
27
+ clip_denoise: true
ldm/configs/karlo/prior_1B_vit_l.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ type: prior
3
+ diffusion_sampler: uniform
4
+ hparams:
5
+ text_ctx: 77
6
+ xf_width: 2048
7
+ xf_layers: 20
8
+ xf_heads: 32
9
+ xf_final_ln: true
10
+ text_drop: 0.2
11
+ clip_dim: 768
12
+
13
+ diffusion:
14
+ steps: 1000
15
+ learn_sigma: false
16
+ sigma_small: true
17
+ noise_schedule: squaredcos_cap_v2
18
+ use_kl: false
19
+ predict_xstart: true
20
+ rescale_learned_sigmas: false
21
+ timestep_respacing: ''
ldm/configs/stable-diffusion/intel/v2-inference-bf16.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022 Intel Corporation
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ model:
5
+ base_learning_rate: 1.0e-4
6
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
7
+ params:
8
+ linear_start: 0.00085
9
+ linear_end: 0.0120
10
+ num_timesteps_cond: 1
11
+ log_every_t: 200
12
+ timesteps: 1000
13
+ first_stage_key: "jpg"
14
+ cond_stage_key: "txt"
15
+ image_size: 64
16
+ channels: 4
17
+ cond_stage_trainable: false
18
+ conditioning_key: crossattn
19
+ monitor: val/loss_simple_ema
20
+ scale_factor: 0.18215
21
+ use_ema: False # we set this to false because this is an inference only config
22
+
23
+ unet_config:
24
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
25
+ params:
26
+ use_checkpoint: False
27
+ use_fp16: False
28
+ use_bf16: True
29
+ image_size: 32 # unused
30
+ in_channels: 4
31
+ out_channels: 4
32
+ model_channels: 320
33
+ attention_resolutions: [ 4, 2, 1 ]
34
+ num_res_blocks: 2
35
+ channel_mult: [ 1, 2, 4, 4 ]
36
+ num_head_channels: 64 # need to fix for flash-attn
37
+ use_spatial_transformer: True
38
+ use_linear_in_transformer: True
39
+ transformer_depth: 1
40
+ context_dim: 1024
41
+ legacy: False
42
+
43
+ first_stage_config:
44
+ target: ldm.models.autoencoder.AutoencoderKL
45
+ params:
46
+ embed_dim: 4
47
+ monitor: val/rec_loss
48
+ ddconfig:
49
+ #attn_type: "vanilla-xformers"
50
+ double_z: true
51
+ z_channels: 4
52
+ resolution: 256
53
+ in_channels: 3
54
+ out_ch: 3
55
+ ch: 128
56
+ ch_mult:
57
+ - 1
58
+ - 2
59
+ - 4
60
+ - 4
61
+ num_res_blocks: 2
62
+ attn_resolutions: []
63
+ dropout: 0.0
64
+ lossconfig:
65
+ target: torch.nn.Identity
66
+
67
+ cond_stage_config:
68
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
69
+ params:
70
+ freeze: True
71
+ layer: "penultimate"
ldm/configs/stable-diffusion/intel/v2-inference-fp32.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022 Intel Corporation
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ model:
5
+ base_learning_rate: 1.0e-4
6
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
7
+ params:
8
+ linear_start: 0.00085
9
+ linear_end: 0.0120
10
+ num_timesteps_cond: 1
11
+ log_every_t: 200
12
+ timesteps: 1000
13
+ first_stage_key: "jpg"
14
+ cond_stage_key: "txt"
15
+ image_size: 64
16
+ channels: 4
17
+ cond_stage_trainable: false
18
+ conditioning_key: crossattn
19
+ monitor: val/loss_simple_ema
20
+ scale_factor: 0.18215
21
+ use_ema: False # we set this to false because this is an inference only config
22
+
23
+ unet_config:
24
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
25
+ params:
26
+ use_checkpoint: False
27
+ use_fp16: False
28
+ image_size: 32 # unused
29
+ in_channels: 4
30
+ out_channels: 4
31
+ model_channels: 320
32
+ attention_resolutions: [ 4, 2, 1 ]
33
+ num_res_blocks: 2
34
+ channel_mult: [ 1, 2, 4, 4 ]
35
+ num_head_channels: 64 # need to fix for flash-attn
36
+ use_spatial_transformer: True
37
+ use_linear_in_transformer: True
38
+ transformer_depth: 1
39
+ context_dim: 1024
40
+ legacy: False
41
+
42
+ first_stage_config:
43
+ target: ldm.models.autoencoder.AutoencoderKL
44
+ params:
45
+ embed_dim: 4
46
+ monitor: val/rec_loss
47
+ ddconfig:
48
+ #attn_type: "vanilla-xformers"
49
+ double_z: true
50
+ z_channels: 4
51
+ resolution: 256
52
+ in_channels: 3
53
+ out_ch: 3
54
+ ch: 128
55
+ ch_mult:
56
+ - 1
57
+ - 2
58
+ - 4
59
+ - 4
60
+ num_res_blocks: 2
61
+ attn_resolutions: []
62
+ dropout: 0.0
63
+ lossconfig:
64
+ target: torch.nn.Identity
65
+
66
+ cond_stage_config:
67
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
68
+ params:
69
+ freeze: True
70
+ layer: "penultimate"
ldm/configs/stable-diffusion/intel/v2-inference-v-bf16.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022 Intel Corporation
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ model:
5
+ base_learning_rate: 1.0e-4
6
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
7
+ params:
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.0120
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: "jpg"
15
+ cond_stage_key: "txt"
16
+ image_size: 64
17
+ channels: 4
18
+ cond_stage_trainable: false
19
+ conditioning_key: crossattn
20
+ monitor: val/loss_simple_ema
21
+ scale_factor: 0.18215
22
+ use_ema: False # we set this to false because this is an inference only config
23
+
24
+ unet_config:
25
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
26
+ params:
27
+ use_checkpoint: False
28
+ use_fp16: False
29
+ use_bf16: True
30
+ image_size: 32 # unused
31
+ in_channels: 4
32
+ out_channels: 4
33
+ model_channels: 320
34
+ attention_resolutions: [ 4, 2, 1 ]
35
+ num_res_blocks: 2
36
+ channel_mult: [ 1, 2, 4, 4 ]
37
+ num_head_channels: 64 # need to fix for flash-attn
38
+ use_spatial_transformer: True
39
+ use_linear_in_transformer: True
40
+ transformer_depth: 1
41
+ context_dim: 1024
42
+ legacy: False
43
+
44
+ first_stage_config:
45
+ target: ldm.models.autoencoder.AutoencoderKL
46
+ params:
47
+ embed_dim: 4
48
+ monitor: val/rec_loss
49
+ ddconfig:
50
+ #attn_type: "vanilla-xformers"
51
+ double_z: true
52
+ z_channels: 4
53
+ resolution: 256
54
+ in_channels: 3
55
+ out_ch: 3
56
+ ch: 128
57
+ ch_mult:
58
+ - 1
59
+ - 2
60
+ - 4
61
+ - 4
62
+ num_res_blocks: 2
63
+ attn_resolutions: []
64
+ dropout: 0.0
65
+ lossconfig:
66
+ target: torch.nn.Identity
67
+
68
+ cond_stage_config:
69
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
70
+ params:
71
+ freeze: True
72
+ layer: "penultimate"
ldm/configs/stable-diffusion/intel/v2-inference-v-fp32.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022 Intel Corporation
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ model:
5
+ base_learning_rate: 1.0e-4
6
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
7
+ params:
8
+ parameterization: "v"
9
+ linear_start: 0.00085
10
+ linear_end: 0.0120
11
+ num_timesteps_cond: 1
12
+ log_every_t: 200
13
+ timesteps: 1000
14
+ first_stage_key: "jpg"
15
+ cond_stage_key: "txt"
16
+ image_size: 64
17
+ channels: 4
18
+ cond_stage_trainable: false
19
+ conditioning_key: crossattn
20
+ monitor: val/loss_simple_ema
21
+ scale_factor: 0.18215
22
+ use_ema: False # we set this to false because this is an inference only config
23
+
24
+ unet_config:
25
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
26
+ params:
27
+ use_checkpoint: False
28
+ use_fp16: False
29
+ image_size: 32 # unused
30
+ in_channels: 4
31
+ out_channels: 4
32
+ model_channels: 320
33
+ attention_resolutions: [ 4, 2, 1 ]
34
+ num_res_blocks: 2
35
+ channel_mult: [ 1, 2, 4, 4 ]
36
+ num_head_channels: 64 # need to fix for flash-attn
37
+ use_spatial_transformer: True
38
+ use_linear_in_transformer: True
39
+ transformer_depth: 1
40
+ context_dim: 1024
41
+ legacy: False
42
+
43
+ first_stage_config:
44
+ target: ldm.models.autoencoder.AutoencoderKL
45
+ params:
46
+ embed_dim: 4
47
+ monitor: val/rec_loss
48
+ ddconfig:
49
+ #attn_type: "vanilla-xformers"
50
+ double_z: true
51
+ z_channels: 4
52
+ resolution: 256
53
+ in_channels: 3
54
+ out_ch: 3
55
+ ch: 128
56
+ ch_mult:
57
+ - 1
58
+ - 2
59
+ - 4
60
+ - 4
61
+ num_res_blocks: 2
62
+ attn_resolutions: []
63
+ dropout: 0.0
64
+ lossconfig:
65
+ target: torch.nn.Identity
66
+
67
+ cond_stage_config:
68
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
69
+ params:
70
+ freeze: True
71
+ layer: "penultimate"
ldm/configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
4
+ params:
5
+ embedding_dropout: 0.25
6
+ parameterization: "v"
7
+ linear_start: 0.00085
8
+ linear_end: 0.0120
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 96
14
+ channels: 4
15
+ cond_stage_trainable: false
16
+ conditioning_key: crossattn-adm
17
+ scale_factor: 0.18215
18
+ monitor: val/loss_simple_ema
19
+ use_ema: False
20
+
21
+ embedder_config:
22
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
23
+
24
+ noise_aug_config:
25
+ target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
26
+ params:
27
+ timestep_dim: 1024
28
+ noise_schedule_config:
29
+ timesteps: 1000
30
+ beta_schedule: squaredcos_cap_v2
31
+
32
+ unet_config:
33
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
34
+ params:
35
+ num_classes: "sequential"
36
+ adm_in_channels: 2048
37
+ use_checkpoint: True
38
+ image_size: 32 # unused
39
+ in_channels: 4
40
+ out_channels: 4
41
+ model_channels: 320
42
+ attention_resolutions: [ 4, 2, 1 ]
43
+ num_res_blocks: 2
44
+ channel_mult: [ 1, 2, 4, 4 ]
45
+ num_head_channels: 64 # need to fix for flash-attn
46
+ use_spatial_transformer: True
47
+ use_linear_in_transformer: True
48
+ transformer_depth: 1
49
+ context_dim: 1024
50
+ legacy: False
51
+
52
+ first_stage_config:
53
+ target: ldm.models.autoencoder.AutoencoderKL
54
+ params:
55
+ embed_dim: 4
56
+ monitor: val/rec_loss
57
+ ddconfig:
58
+ attn_type: "vanilla-xformers"
59
+ double_z: true
60
+ z_channels: 4
61
+ resolution: 256
62
+ in_channels: 3
63
+ out_ch: 3
64
+ ch: 128
65
+ ch_mult:
66
+ - 1
67
+ - 2
68
+ - 4
69
+ - 4
70
+ num_res_blocks: 2
71
+ attn_resolutions: [ ]
72
+ dropout: 0.0
73
+ lossconfig:
74
+ target: torch.nn.Identity
75
+
76
+ cond_stage_config:
77
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
78
+ params:
79
+ freeze: True
80
+ layer: "penultimate"
ldm/configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.ImageEmbeddingConditionedLatentDiffusion
4
+ params:
5
+ embedding_dropout: 0.25
6
+ parameterization: "v"
7
+ linear_start: 0.00085
8
+ linear_end: 0.0120
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 96
14
+ channels: 4
15
+ cond_stage_trainable: false
16
+ conditioning_key: crossattn-adm
17
+ scale_factor: 0.18215
18
+ monitor: val/loss_simple_ema
19
+ use_ema: False
20
+
21
+ embedder_config:
22
+ target: ldm.modules.encoders.modules.ClipImageEmbedder
23
+ params:
24
+ model: "ViT-L/14"
25
+
26
+ noise_aug_config:
27
+ target: ldm.modules.encoders.modules.CLIPEmbeddingNoiseAugmentation
28
+ params:
29
+ clip_stats_path: "checkpoints/karlo_models/ViT-L-14_stats.th"
30
+ timestep_dim: 768
31
+ noise_schedule_config:
32
+ timesteps: 1000
33
+ beta_schedule: squaredcos_cap_v2
34
+
35
+ unet_config:
36
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
37
+ params:
38
+ num_classes: "sequential"
39
+ adm_in_channels: 1536
40
+ use_checkpoint: True
41
+ image_size: 32 # unused
42
+ in_channels: 4
43
+ out_channels: 4
44
+ model_channels: 320
45
+ attention_resolutions: [ 4, 2, 1 ]
46
+ num_res_blocks: 2
47
+ channel_mult: [ 1, 2, 4, 4 ]
48
+ num_head_channels: 64 # need to fix for flash-attn
49
+ use_spatial_transformer: True
50
+ use_linear_in_transformer: True
51
+ transformer_depth: 1
52
+ context_dim: 1024
53
+ legacy: False
54
+
55
+ first_stage_config:
56
+ target: ldm.models.autoencoder.AutoencoderKL
57
+ params:
58
+ embed_dim: 4
59
+ monitor: val/rec_loss
60
+ ddconfig:
61
+ attn_type: "vanilla-xformers"
62
+ double_z: true
63
+ z_channels: 4
64
+ resolution: 256
65
+ in_channels: 3
66
+ out_ch: 3
67
+ ch: 128
68
+ ch_mult:
69
+ - 1
70
+ - 2
71
+ - 4
72
+ - 4
73
+ num_res_blocks: 2
74
+ attn_resolutions: [ ]
75
+ dropout: 0.0
76
+ lossconfig:
77
+ target: torch.nn.Identity
78
+
79
+ cond_stage_config:
80
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
81
+ params:
82
+ freeze: True
83
+ layer: "penultimate"
ldm/configs/stable-diffusion/v2-inference-v.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
4
+ params:
5
+ parameterization: "v"
6
+ linear_start: 0.00085
7
+ linear_end: 0.0120
8
+ num_timesteps_cond: 1
9
+ log_every_t: 200
10
+ timesteps: 1000
11
+ first_stage_key: "jpg"
12
+ cond_stage_key: "txt"
13
+ image_size: 64
14
+ channels: 4
15
+ cond_stage_trainable: false
16
+ conditioning_key: crossattn
17
+ monitor: val/loss_simple_ema
18
+ scale_factor: 0.18215
19
+ use_ema: False # we set this to false because this is an inference only config
20
+
21
+ unet_config:
22
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
23
+ params:
24
+ use_checkpoint: True
25
+ use_fp16: True
26
+ image_size: 32 # unused
27
+ in_channels: 4
28
+ out_channels: 4
29
+ model_channels: 320
30
+ attention_resolutions: [ 4, 2, 1 ]
31
+ num_res_blocks: 2
32
+ channel_mult: [ 1, 2, 4, 4 ]
33
+ num_head_channels: 64 # need to fix for flash-attn
34
+ use_spatial_transformer: True
35
+ use_linear_in_transformer: True
36
+ transformer_depth: 1
37
+ context_dim: 1024
38
+ legacy: False
39
+
40
+ first_stage_config:
41
+ target: ldm.models.autoencoder.AutoencoderKL
42
+ params:
43
+ embed_dim: 4
44
+ monitor: val/rec_loss
45
+ ddconfig:
46
+ #attn_type: "vanilla-xformers"
47
+ double_z: true
48
+ z_channels: 4
49
+ resolution: 256
50
+ in_channels: 3
51
+ out_ch: 3
52
+ ch: 128
53
+ ch_mult:
54
+ - 1
55
+ - 2
56
+ - 4
57
+ - 4
58
+ num_res_blocks: 2
59
+ attn_resolutions: []
60
+ dropout: 0.0
61
+ lossconfig:
62
+ target: torch.nn.Identity
63
+
64
+ cond_stage_config:
65
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
66
+ params:
67
+ freeze: True
68
+ layer: "penultimate"
ldm/configs/stable-diffusion/v2-inference.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-4
3
+ target: ldm.models.diffusion.ddpm.LatentDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "jpg"
11
+ cond_stage_key: "txt"
12
+ image_size: 64
13
+ channels: 4
14
+ cond_stage_trainable: false
15
+ conditioning_key: crossattn
16
+ monitor: val/loss_simple_ema
17
+ scale_factor: 0.18215
18
+ use_ema: False # we set this to false because this is an inference only config
19
+
20
+ unet_config:
21
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
22
+ params:
23
+ use_checkpoint: True
24
+ use_fp16: True
25
+ image_size: 32 # unused
26
+ in_channels: 4
27
+ out_channels: 4
28
+ model_channels: 320
29
+ attention_resolutions: [ 4, 2, 1 ]
30
+ num_res_blocks: 2
31
+ channel_mult: [ 1, 2, 4, 4 ]
32
+ num_head_channels: 64 # need to fix for flash-attn
33
+ use_spatial_transformer: True
34
+ use_linear_in_transformer: True
35
+ transformer_depth: 1
36
+ context_dim: 1024
37
+ legacy: False
38
+
39
+ first_stage_config:
40
+ target: ldm.models.autoencoder.AutoencoderKL
41
+ params:
42
+ embed_dim: 4
43
+ monitor: val/rec_loss
44
+ ddconfig:
45
+ #attn_type: "vanilla-xformers"
46
+ double_z: true
47
+ z_channels: 4
48
+ resolution: 256
49
+ in_channels: 3
50
+ out_ch: 3
51
+ ch: 128
52
+ ch_mult:
53
+ - 1
54
+ - 2
55
+ - 4
56
+ - 4
57
+ num_res_blocks: 2
58
+ attn_resolutions: []
59
+ dropout: 0.0
60
+ lossconfig:
61
+ target: torch.nn.Identity
62
+
63
+ cond_stage_config:
64
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
65
+ params:
66
+ freeze: True
67
+ layer: "penultimate"
ldm/configs/stable-diffusion/v2-inpainting-inference.yaml ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 5.0e-05
3
+ target: ldm.models.diffusion.ddpm.LatentInpaintDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "jpg"
11
+ cond_stage_key: "txt"
12
+ image_size: 64
13
+ channels: 4
14
+ cond_stage_trainable: false
15
+ conditioning_key: hybrid
16
+ scale_factor: 0.18215
17
+ monitor: val/loss_simple_ema
18
+ finetune_keys: null
19
+ use_ema: False
20
+
21
+ unet_config:
22
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
23
+ params:
24
+ use_checkpoint: True
25
+ image_size: 32 # unused
26
+ in_channels: 9
27
+ out_channels: 4
28
+ model_channels: 320
29
+ attention_resolutions: [ 4, 2, 1 ]
30
+ num_res_blocks: 2
31
+ channel_mult: [ 1, 2, 4, 4 ]
32
+ num_head_channels: 64 # need to fix for flash-attn
33
+ use_spatial_transformer: True
34
+ use_linear_in_transformer: True
35
+ transformer_depth: 1
36
+ context_dim: 1024
37
+ legacy: False
38
+
39
+ first_stage_config:
40
+ target: ldm.models.autoencoder.AutoencoderKL
41
+ params:
42
+ embed_dim: 4
43
+ monitor: val/rec_loss
44
+ ddconfig:
45
+ #attn_type: "vanilla-xformers"
46
+ double_z: true
47
+ z_channels: 4
48
+ resolution: 256
49
+ in_channels: 3
50
+ out_ch: 3
51
+ ch: 128
52
+ ch_mult:
53
+ - 1
54
+ - 2
55
+ - 4
56
+ - 4
57
+ num_res_blocks: 2
58
+ attn_resolutions: [ ]
59
+ dropout: 0.0
60
+ lossconfig:
61
+ target: torch.nn.Identity
62
+
63
+ cond_stage_config:
64
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
65
+ params:
66
+ freeze: True
67
+ layer: "penultimate"
68
+
69
+
70
+ data:
71
+ target: ldm.data.laion.WebDataModuleFromConfig
72
+ params:
73
+ tar_base: null # for concat as in LAION-A
74
+ p_unsafe_threshold: 0.1
75
+ filter_word_list: "data/filters.yaml"
76
+ max_pwatermark: 0.45
77
+ batch_size: 8
78
+ num_workers: 6
79
+ multinode: True
80
+ min_size: 512
81
+ train:
82
+ shards:
83
+ - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-0/{00000..18699}.tar -"
84
+ - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-1/{00000..18699}.tar -"
85
+ - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-2/{00000..18699}.tar -"
86
+ - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-3/{00000..18699}.tar -"
87
+ - "pipe:aws s3 cp s3://stability-aws/laion-a-native/part-4/{00000..18699}.tar -" #{00000-94333}.tar"
88
+ shuffle: 10000
89
+ image_key: jpg
90
+ image_transforms:
91
+ - target: torchvision.transforms.Resize
92
+ params:
93
+ size: 512
94
+ interpolation: 3
95
+ - target: torchvision.transforms.RandomCrop
96
+ params:
97
+ size: 512
98
+ postprocess:
99
+ target: ldm.data.laion.AddMask
100
+ params:
101
+ mode: "512train-large"
102
+ p_drop: 0.25
103
+ # NOTE use enough shards to avoid empty validation loops in workers
104
+ validation:
105
+ shards:
106
+ - "pipe:aws s3 cp s3://deep-floyd-s3/datasets/laion_cleaned-part5/{93001..94333}.tar - "
107
+ shuffle: 0
108
+ image_key: jpg
109
+ image_transforms:
110
+ - target: torchvision.transforms.Resize
111
+ params:
112
+ size: 512
113
+ interpolation: 3
114
+ - target: torchvision.transforms.CenterCrop
115
+ params:
116
+ size: 512
117
+ postprocess:
118
+ target: ldm.data.laion.AddMask
119
+ params:
120
+ mode: "512train-large"
121
+ p_drop: 0.25
122
+
123
+ lightning:
124
+ find_unused_parameters: True
125
+ modelcheckpoint:
126
+ params:
127
+ every_n_train_steps: 5000
128
+
129
+ callbacks:
130
+ metrics_over_trainsteps_checkpoint:
131
+ params:
132
+ every_n_train_steps: 10000
133
+
134
+ image_logger:
135
+ target: main.ImageLogger
136
+ params:
137
+ enable_autocast: False
138
+ disabled: False
139
+ batch_frequency: 1000
140
+ max_images: 4
141
+ increase_log_steps: False
142
+ log_first_step: False
143
+ log_images_kwargs:
144
+ use_ema_scope: False
145
+ inpaint: False
146
+ plot_progressive_rows: False
147
+ plot_diffusion_rows: False
148
+ N: 4
149
+ unconditional_guidance_scale: 5.0
150
+ unconditional_guidance_label: [""]
151
+ ddim_steps: 50 # todo check these out for depth2img,
152
+ ddim_eta: 0.0 # todo check these out for depth2img,
153
+
154
+ trainer:
155
+ benchmark: True
156
+ val_check_interval: 5000000
157
+ num_sanity_val_steps: 0
158
+ accumulate_grad_batches: 1
ldm/configs/stable-diffusion/v2-midas-inference.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 5.0e-07
3
+ target: ldm.models.diffusion.ddpm.LatentDepth2ImageDiffusion
4
+ params:
5
+ linear_start: 0.00085
6
+ linear_end: 0.0120
7
+ num_timesteps_cond: 1
8
+ log_every_t: 200
9
+ timesteps: 1000
10
+ first_stage_key: "jpg"
11
+ cond_stage_key: "txt"
12
+ image_size: 64
13
+ channels: 4
14
+ cond_stage_trainable: false
15
+ conditioning_key: hybrid
16
+ scale_factor: 0.18215
17
+ monitor: val/loss_simple_ema
18
+ finetune_keys: null
19
+ use_ema: False
20
+
21
+ depth_stage_config:
22
+ target: ldm.modules.midas.api.MiDaSInference
23
+ params:
24
+ model_type: "dpt_hybrid"
25
+
26
+ unet_config:
27
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
28
+ params:
29
+ use_checkpoint: True
30
+ image_size: 32 # unused
31
+ in_channels: 5
32
+ out_channels: 4
33
+ model_channels: 320
34
+ attention_resolutions: [ 4, 2, 1 ]
35
+ num_res_blocks: 2
36
+ channel_mult: [ 1, 2, 4, 4 ]
37
+ num_head_channels: 64 # need to fix for flash-attn
38
+ use_spatial_transformer: True
39
+ use_linear_in_transformer: True
40
+ transformer_depth: 1
41
+ context_dim: 1024
42
+ legacy: False
43
+
44
+ first_stage_config:
45
+ target: ldm.models.autoencoder.AutoencoderKL
46
+ params:
47
+ embed_dim: 4
48
+ monitor: val/rec_loss
49
+ ddconfig:
50
+ #attn_type: "vanilla-xformers"
51
+ double_z: true
52
+ z_channels: 4
53
+ resolution: 256
54
+ in_channels: 3
55
+ out_ch: 3
56
+ ch: 128
57
+ ch_mult:
58
+ - 1
59
+ - 2
60
+ - 4
61
+ - 4
62
+ num_res_blocks: 2
63
+ attn_resolutions: [ ]
64
+ dropout: 0.0
65
+ lossconfig:
66
+ target: torch.nn.Identity
67
+
68
+ cond_stage_config:
69
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
70
+ params:
71
+ freeze: True
72
+ layer: "penultimate"
73
+
74
+
ldm/configs/stable-diffusion/x4-upscaling.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_learning_rate: 1.0e-04
3
+ target: ldm.models.diffusion.ddpm.LatentUpscaleDiffusion
4
+ params:
5
+ parameterization: "v"
6
+ low_scale_key: "lr"
7
+ linear_start: 0.0001
8
+ linear_end: 0.02
9
+ num_timesteps_cond: 1
10
+ log_every_t: 200
11
+ timesteps: 1000
12
+ first_stage_key: "jpg"
13
+ cond_stage_key: "txt"
14
+ image_size: 128
15
+ channels: 4
16
+ cond_stage_trainable: false
17
+ conditioning_key: "hybrid-adm"
18
+ monitor: val/loss_simple_ema
19
+ scale_factor: 0.08333
20
+ use_ema: False
21
+
22
+ low_scale_config:
23
+ target: ldm.modules.diffusionmodules.upscaling.ImageConcatWithNoiseAugmentation
24
+ params:
25
+ noise_schedule_config: # image space
26
+ linear_start: 0.0001
27
+ linear_end: 0.02
28
+ max_noise_level: 350
29
+
30
+ unet_config:
31
+ target: ldm.modules.diffusionmodules.openaimodel.UNetModel
32
+ params:
33
+ use_checkpoint: True
34
+ num_classes: 1000 # timesteps for noise conditioning (here constant, just need one)
35
+ image_size: 128
36
+ in_channels: 7
37
+ out_channels: 4
38
+ model_channels: 256
39
+ attention_resolutions: [ 2,4,8]
40
+ num_res_blocks: 2
41
+ channel_mult: [ 1, 2, 2, 4]
42
+ disable_self_attentions: [True, True, True, False]
43
+ disable_middle_self_attn: False
44
+ num_heads: 8
45
+ use_spatial_transformer: True
46
+ transformer_depth: 1
47
+ context_dim: 1024
48
+ legacy: False
49
+ use_linear_in_transformer: True
50
+
51
+ first_stage_config:
52
+ target: ldm.models.autoencoder.AutoencoderKL
53
+ params:
54
+ embed_dim: 4
55
+ ddconfig:
56
+ # attn_type: "vanilla-xformers" this model needs efficient attention to be feasible on HR data, also the decoder seems to break in half precision (UNet is fine though)
57
+ double_z: True
58
+ z_channels: 4
59
+ resolution: 256
60
+ in_channels: 3
61
+ out_ch: 3
62
+ ch: 128
63
+ ch_mult: [ 1,2,4 ] # num_down = len(ch_mult)-1
64
+ num_res_blocks: 2
65
+ attn_resolutions: [ ]
66
+ dropout: 0.0
67
+
68
+ lossconfig:
69
+ target: torch.nn.Identity
70
+
71
+ cond_stage_config:
72
+ target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
73
+ params:
74
+ freeze: True
75
+ layer: "penultimate"
76
+
ldm/data/__init__.py ADDED
File without changes
ldm/data/util.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from ldm.modules.midas.api import load_midas_transform
4
+
5
+
6
+ class AddMiDaS(object):
7
+ def __init__(self, model_type):
8
+ super().__init__()
9
+ self.transform = load_midas_transform(model_type)
10
+
11
+ def pt2np(self, x):
12
+ x = ((x + 1.0) * .5).detach().cpu().numpy()
13
+ return x
14
+
15
+ def np2pt(self, x):
16
+ x = torch.from_numpy(x) * 2 - 1.
17
+ return x
18
+
19
+ def __call__(self, sample):
20
+ # sample['jpg'] is tensor hwc in [-1, 1] at this point
21
+ x = self.pt2np(sample['jpg'])
22
+ x = self.transform({"image": x})["image"]
23
+ sample['midas_in'] = x
24
+ return sample
ldm/models/autoencoder.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pytorch_lightning as pl
3
+ import torch.nn.functional as F
4
+ from contextlib import contextmanager
5
+
6
+ from ldm.modules.diffusionmodules.model import Encoder, Decoder
7
+ from ldm.modules.distributions.distributions import DiagonalGaussianDistribution
8
+
9
+ from ldm.util import instantiate_from_config
10
+ from ldm.modules.ema import LitEma
11
+
12
+
13
+ class AutoencoderKL(pl.LightningModule):
14
+ def __init__(self,
15
+ ddconfig,
16
+ lossconfig,
17
+ embed_dim,
18
+ ckpt_path=None,
19
+ ignore_keys=[],
20
+ image_key="image",
21
+ colorize_nlabels=None,
22
+ monitor=None,
23
+ ema_decay=None,
24
+ learn_logvar=False
25
+ ):
26
+ super().__init__()
27
+ self.learn_logvar = learn_logvar
28
+ self.image_key = image_key
29
+ self.encoder = Encoder(**ddconfig)
30
+ self.decoder = Decoder(**ddconfig)
31
+ self.loss = instantiate_from_config(lossconfig)
32
+ assert ddconfig["double_z"]
33
+ self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
34
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
35
+ self.embed_dim = embed_dim
36
+ if colorize_nlabels is not None:
37
+ assert type(colorize_nlabels)==int
38
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
39
+ if monitor is not None:
40
+ self.monitor = monitor
41
+
42
+ self.use_ema = ema_decay is not None
43
+ if self.use_ema:
44
+ self.ema_decay = ema_decay
45
+ assert 0. < ema_decay < 1.
46
+ self.model_ema = LitEma(self, decay=ema_decay)
47
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
48
+
49
+ if ckpt_path is not None:
50
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
51
+
52
+ def init_from_ckpt(self, path, ignore_keys=list()):
53
+ sd = torch.load(path, map_location="cpu")["state_dict"]
54
+ keys = list(sd.keys())
55
+ for k in keys:
56
+ for ik in ignore_keys:
57
+ if k.startswith(ik):
58
+ print("Deleting key {} from state_dict.".format(k))
59
+ del sd[k]
60
+ self.load_state_dict(sd, strict=False)
61
+ print(f"Restored from {path}")
62
+
63
+ @contextmanager
64
+ def ema_scope(self, context=None):
65
+ if self.use_ema:
66
+ self.model_ema.store(self.parameters())
67
+ self.model_ema.copy_to(self)
68
+ if context is not None:
69
+ print(f"{context}: Switched to EMA weights")
70
+ try:
71
+ yield None
72
+ finally:
73
+ if self.use_ema:
74
+ self.model_ema.restore(self.parameters())
75
+ if context is not None:
76
+ print(f"{context}: Restored training weights")
77
+
78
+ def on_train_batch_end(self, *args, **kwargs):
79
+ if self.use_ema:
80
+ self.model_ema(self)
81
+
82
+ def encode(self, x):
83
+ h = self.encoder(x)
84
+ moments = self.quant_conv(h)
85
+ posterior = DiagonalGaussianDistribution(moments)
86
+ return posterior
87
+
88
+ def decode(self, z):
89
+ z = self.post_quant_conv(z)
90
+ dec = self.decoder(z)
91
+ return dec
92
+
93
+ def forward(self, input, sample_posterior=True):
94
+ posterior = self.encode(input)
95
+ if sample_posterior:
96
+ z = posterior.sample()
97
+ else:
98
+ z = posterior.mode()
99
+ dec = self.decode(z)
100
+ return dec, posterior
101
+
102
def get_input(self, batch, k):
    """Fetch ``batch[k]`` and return it as a float tensor with axis 3 moved to axis 1.

    A 3-dimensional entry first gets a trailing axis of size one. The result
    is permuted with ``(0, 3, 1, 2)``, made contiguous, and cast to float.
    """
    x = batch[k]
    if x.ndim == 3:
        x = x.unsqueeze(-1)
    channels_first = x.permute(0, 3, 1, 2)
    return channels_first.to(memory_format=torch.contiguous_format).float()
108
+
109
def training_step(self, batch, batch_idx, optimizer_idx):
    """Compute and log the loss belonging to the given optimizer index.

    Index 0 logs under ``"aeloss"`` (encoder + decoder + logvar), index 1
    under ``"discloss"`` (discriminator). In both cases ``self.loss`` is
    called with ``split="train"`` and its second return value is forwarded
    to ``self.log_dict``. Any other index returns ``None`` without logging.
    """
    inputs = self.get_input(batch, self.image_key)
    reconstructions, posterior = self(inputs)

    if optimizer_idx == 0:
        loss_name = "aeloss"
    elif optimizer_idx == 1:
        loss_name = "discloss"
    else:
        return None

    loss_value, loss_logs = self.loss(
        inputs,
        reconstructions,
        posterior,
        optimizer_idx,
        self.global_step,
        last_layer=self.get_last_layer(),
        split="train",
    )
    self.log(loss_name, loss_value, prog_bar=True, logger=True, on_step=True, on_epoch=True)
    self.log_dict(loss_logs, prog_bar=False, logger=True, on_step=True, on_epoch=False)
    return loss_value
129
+
130
def validation_step(self, batch, batch_idx):
    """Run ``_validation_step`` twice: as-is, then inside ``ema_scope``.

    The second run uses ``postfix="_ema"``; its return value is discarded.
    Returns whatever the first run returned.
    """
    plain_result = self._validation_step(batch, batch_idx)
    with self.ema_scope():
        self._validation_step(batch, batch_idx, postfix="_ema")
    return plain_result
135
+
136
+ def _validation_step(self, batch, batch_idx, postfix=""):
137
+ inputs = self.get_input(batch, self.image_key)
138
+ reconstructions, posterior = self(inputs)
139
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
140
+ last_layer=self.get_last_layer(), split="val"+postfix)
141
+
142
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
143
+ last_layer=self.get_last_layer(), split="val"+postfix)
144
+
145
+ self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"])
146
+ self.log_dict(log_dict_ae)
147
+ self.log_dict(log_dict_disc)
148
+ return self.log_dict
149
+
150
def configure_optimizers(self):
    """Build two Adam optimizers that share learning rate and betas.

    The first optimizer covers the parameters of ``encoder``, ``decoder``,
    ``quant_conv`` and ``post_quant_conv`` (plus ``self.loss.logvar`` when
    ``self.learn_logvar`` is set); the second covers
    ``self.loss.discriminator``.

    Returns:
        ``([opt_ae, opt_disc], [])``; the second list is always empty.
    """
    lr = self.learning_rate
    autoencoder_parts = (self.encoder, self.decoder, self.quant_conv, self.post_quant_conv)
    ae_params = [p for part in autoencoder_parts for p in part.parameters()]
    if self.learn_logvar:
        print(f"{self.__class__.__name__}: Learning logvar")
        ae_params.append(self.loss.logvar)
    adam_settings = dict(lr=lr, betas=(0.5, 0.9))
    opt_ae = torch.optim.Adam(ae_params, **adam_settings)
    opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(), **adam_settings)
    return [opt_ae, opt_disc], []
162
+
163
def get_last_layer(self):
    """Return the ``weight`` attribute of ``self.decoder.conv_out``."""
    final_conv = self.decoder.conv_out
    return final_conv.weight
165
+
166
@torch.no_grad()
def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
    """Collect inputs, reconstructions and samples into a dict for logging.

    Args:
        batch: Passed to ``self.get_input`` together with ``self.image_key``.
        only_inputs: If truthy, only the ``"inputs"`` entry is produced.
        log_ema: If truthy (or if ``self.use_ema`` is set), a second pass is
            run inside ``self.ema_scope()`` and stored under ``*_ema`` keys.
        **kwargs: Accepted and ignored.

    Returns:
        Dict with keys, in insertion order: ``"samples"``,
        ``"reconstructions"``, optionally ``"samples_ema"`` and
        ``"reconstructions_ema"``, and finally ``"inputs"``.
    """
    images = {}
    x = self.get_input(batch, self.image_key).to(self.device)
    if only_inputs:
        images["inputs"] = x
        return images

    xrec, posterior = self(x)
    if x.shape[1] > 3:
        # More than three channels: project both tensors to RGB.
        assert xrec.shape[1] > 3
        x = self.to_rgb(x)
        xrec = self.to_rgb(xrec)
    images["samples"] = self.decode(torch.randn_like(posterior.sample()))
    images["reconstructions"] = xrec

    if log_ema or self.use_ema:
        with self.ema_scope():
            # NOTE: x may already have been replaced by its RGB projection.
            xrec_ema, posterior_ema = self(x)
            if x.shape[1] > 3:
                assert xrec_ema.shape[1] > 3
                xrec_ema = self.to_rgb(xrec_ema)
            images["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
            images["reconstructions_ema"] = xrec_ema

    images["inputs"] = x
    return images
191
+
192
def to_rgb(self, x):
    """Project ``x`` to three channels with a 1x1 convolution and rescale.

    Requires ``self.image_key == "segmentation"``. If no ``colorize`` buffer
    exists yet, a random ``(3, x.shape[1], 1, 1)`` one is registered. The
    convolved result is rescaled by its own min and max so that it spans
    ``-1`` to ``1``.
    """
    assert self.image_key == "segmentation"
    if not hasattr(self, "colorize"):
        projection = torch.randn(3, x.shape[1], 1, 1).to(x)
        self.register_buffer("colorize", projection)
    x = F.conv2d(x, weight=self.colorize)
    lowest = x.min()
    highest = x.max()
    x = 2.*(x - lowest)/(highest - lowest) - 1.
    return x
199
+
200
+
201
class IdentityFirstStage(torch.nn.Module):
    """Module whose ``encode``, ``decode`` and ``forward`` return their input unchanged."""

    def __init__(self, *args, vq_interface=False, **kwargs):
        """Store ``vq_interface``; all other arguments are ignored."""
        super().__init__()
        # Selects the return format of quantize().
        self.vq_interface = vq_interface

    def encode(self, x, *args, **kwargs):
        """Return ``x`` as-is."""
        return x

    def decode(self, x, *args, **kwargs):
        """Return ``x`` as-is."""
        return x

    def quantize(self, x, *args, **kwargs):
        """Return ``x``, or ``(x, None, [None, None, None])`` when ``vq_interface`` is set."""
        if not self.vq_interface:
            return x
        return x, None, [None, None, None]

    def forward(self, x, *args, **kwargs):
        """Return ``x`` as-is."""
        return x
219
+
ldm/models/diffusion/__init__.py ADDED
File without changes
ldm/models/diffusion/ddim.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+
7
+ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor
8
+
9
+
10
class DDIMSampler(object):
    """Sampling-only DDIM loop built around a model exposing a noise schedule.

    The wrapped ``model`` is expected to provide ``num_timesteps``, ``betas``,
    ``alphas_cumprod``, ``alphas_cumprod_prev``, ``apply_model`` and
    ``parameterization`` (all of them are read by the methods below).
    """

    def __init__(self, model, schedule="linear", device=torch.device("cuda"), **kwargs):
        """Remember the model, its number of timesteps, the schedule name and device."""
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule
        self.device = device

    def register_buffer(self, name, attr):
        """Store ``attr`` on the sampler, moving plain tensors to ``self.device`` first."""
        if type(attr) == torch.Tensor:
            if attr.device != self.device:
                attr = attr.to(self.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Compute and store every schedule quantity the sampling methods read.

        Sets ``ddim_timesteps``, float32 copies of the model's betas/alphas and
        derived terms, the DDIM sigmas/alphas for the chosen sub-sequence, and
        ``ddim_sigmas_for_original_num_steps``.
        """
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta,verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,  # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               dynamic_threshold=None,
               ucg_schedule=None,
               **kwargs
               ):
        """Build the schedule for ``S`` steps and run ``ddim_sampling``.

        Args:
            S: Number of DDIM steps passed to ``make_schedule``.
            batch_size: Leading dimension of the sampled tensor.
            shape: ``(C, H, W)`` of one sample.
            conditioning: Optional tensor, list of tensors, or dict; only its
                leading dimension is checked here (a warning is printed on a
                mismatch with ``batch_size``).

        The remaining arguments are forwarded to ``ddim_sampling``
        (``normals_sequence`` and ``**kwargs`` are accepted but unused).

        Returns:
            ``(samples, intermediates)`` as produced by ``ddim_sampling``.
        """
        if conditioning is not None:
            if isinstance(conditioning, dict):
                ctmp = conditioning[list(conditioning.keys())[0]]
                while isinstance(ctmp, list): ctmp = ctmp[0]
                cbs = ctmp.shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")

            elif isinstance(conditioning, list):
                for ctmp in conditioning:
                    if ctmp.shape[0] != batch_size:
                        # Report this entry's own size; the former message
                        # used `cbs`, which is never bound in this branch.
                        print(f"Warning: Got {ctmp.shape[0]} conditionings but batch-size is {batch_size}")

            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for DDIM sampling is {size}, eta {eta}')

        samples, intermediates = self.ddim_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    dynamic_threshold=dynamic_threshold,
                                                    ucg_schedule=ucg_schedule
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def ddim_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None,
                      ucg_schedule=None):
        """Iterate ``p_sample_ddim`` from the last timestep down to the first.

        Starts from ``x_T`` (or Gaussian noise of ``shape``). When ``mask`` is
        given, ``x0`` is required and the masked region is overwritten with a
        noised copy of ``x0`` before every step. When ``ucg_schedule`` is
        given it must hold one guidance scale per step.

        Returns:
            ``(img, intermediates)`` where ``intermediates`` holds the lists
            ``'x_inter'`` and ``'pred_x0'``.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            if ucg_schedule is not None:
                # Compare against total_steps: time_range may be a reversed
                # range iterator, which does not support len().
                assert len(ucg_schedule) == total_steps
                unconditional_guidance_scale = ucg_schedule[i]

            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      dynamic_threshold=dynamic_threshold)
            img, pred_x0 = outs
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
                      dynamic_threshold=None):
        """Perform one reverse step and return ``(x_prev, pred_x0)``.

        If an unconditional conditioning is supplied and the guidance scale
        differs from 1, the model is evaluated on a doubled batch and the two
        halves are combined as ``uncond + scale * (cond - uncond)``.

        Raises:
            NotImplementedError: if ``dynamic_threshold`` is not ``None``.
        """
        b, *_, device = *x.shape, x.device

        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
            model_output = self.model.apply_model(x, t, c)
        else:
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t] * 2)
            if isinstance(c, dict):
                assert isinstance(unconditional_conditioning, dict)
                c_in = dict()
                for k in c:
                    if isinstance(c[k], list):
                        c_in[k] = [torch.cat([
                            unconditional_conditioning[k][i],
                            c[k][i]]) for i in range(len(c[k]))]
                    else:
                        c_in[k] = torch.cat([
                            unconditional_conditioning[k],
                            c[k]])
            elif isinstance(c, list):
                c_in = list()
                assert isinstance(unconditional_conditioning, list)
                for i in range(len(c)):
                    c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
            else:
                c_in = torch.cat([unconditional_conditioning, c])
            model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
            model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)

        if self.model.parameterization == "v":
            e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
        else:
            e_t = model_output

        if score_corrector is not None:
            assert self.model.parameterization == "eps", 'not implemented'
            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # make_schedule registers ddim_sigmas_for_original_num_steps on the
        # sampler itself, so it is read from self rather than self.model.
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
        # select parameters corresponding to the currently considered timestep
        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)

        # current prediction for x_0
        if self.model.parameterization != "v":
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
        else:
            pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)

        if quantize_denoised:
            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)

        if dynamic_threshold is not None:
            raise NotImplementedError()

        # direction pointing to x_t
        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
        if noise_dropout > 0.:
            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
        return x_prev, pred_x0

    @torch.no_grad()
    def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None,
               unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None):
        """Step ``x0`` forward for ``t_enc`` steps using the model's predictions.

        Returns:
            ``(x_next, out)`` where ``out`` contains ``'x_encoded'``,
            ``'intermediate_steps'`` and, when ``return_intermediates`` is
            truthy, ``'intermediates'``.
        """
        num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0]

        assert t_enc <= num_reference_steps
        num_steps = t_enc

        if use_original_steps:
            alphas_next = self.alphas_cumprod[:num_steps]
            alphas = self.alphas_cumprod_prev[:num_steps]
        else:
            alphas_next = self.ddim_alphas[:num_steps]
            alphas = torch.tensor(self.ddim_alphas_prev[:num_steps])

        x_next = x0
        intermediates = []
        inter_steps = []
        for i in tqdm(range(num_steps), desc='Encoding Image'):
            t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long)
            if unconditional_guidance_scale == 1.:
                noise_pred = self.model.apply_model(x_next, t, c)
            else:
                assert unconditional_conditioning is not None
                e_t_uncond, noise_pred = torch.chunk(
                    self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)),
                                           torch.cat((unconditional_conditioning, c))), 2)
                noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond)

            xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next
            weighted_noise_pred = alphas_next[i].sqrt() * (
                    (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred
            x_next = xt_weighted + weighted_noise_pred
            if return_intermediates and i % (
                    num_steps // return_intermediates) == 0 and i < num_steps - 1:
                intermediates.append(x_next)
                inter_steps.append(i)
            elif return_intermediates and i >= num_steps - 2:
                intermediates.append(x_next)
                inter_steps.append(i)
            if callback: callback(i)

        out = {'x_encoded': x_next, 'intermediate_steps': inter_steps}
        if return_intermediates:
            out.update({'intermediates': intermediates})
        return x_next, out

    @torch.no_grad()
    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
        """Return ``sqrt(alpha_t) * x0 + sqrt(1 - alpha_t) * noise`` for index ``t``.

        ``noise`` defaults to standard Gaussian noise shaped like ``x0``.
        """
        # fast, but does not allow for exact reconstruction
        # t serves as an index to gather the correct alphas
        if use_original_steps:
            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
        else:
            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas

        if noise is None:
            noise = torch.randn_like(x0)
        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)

    @torch.no_grad()
    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
               use_original_steps=False, callback=None):
        """Run ``p_sample_ddim`` over the first ``t_start`` timesteps in reverse order.

        Returns the tensor obtained after the last step.
        """
        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
        timesteps = timesteps[:t_start]

        time_range = np.flip(timesteps)
        total_steps = timesteps.shape[0]
        print(f"Running DDIM Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
        x_dec = x_latent
        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
                                          unconditional_guidance_scale=unconditional_guidance_scale,
                                          unconditional_conditioning=unconditional_conditioning)
            if callback: callback(i)
        return x_dec
ldm/models/diffusion/ddpm.py ADDED
@@ -0,0 +1,1884 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ wild mixture of
3
+ https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
4
+ https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
5
+ https://github.com/CompVis/taming-transformers
6
+ -- merci
7
+ """
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ from torch.optim.lr_scheduler import LambdaLR
14
+ from einops import rearrange, repeat
15
+ from contextlib import contextmanager, nullcontext
16
+ from functools import partial
17
+ import itertools
18
+ from tqdm import tqdm
19
+ from torchvision.utils import make_grid
20
+ from pytorch_lightning.utilities.distributed import rank_zero_only
21
+ from omegaconf import ListConfig
22
+
23
+ from ldm.util import log_txt_as_img, exists, default, ismap, isimage, mean_flat, count_params, instantiate_from_config
24
+ from ldm.modules.ema import LitEma
25
+ from ldm.modules.distributions.distributions import normal_kl, DiagonalGaussianDistribution
26
+ from ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL
27
+ from ldm.modules.diffusionmodules.util import make_beta_schedule, extract_into_tensor, noise_like
28
+ from ldm.models.diffusion.ddim import DDIMSampler
29
+
30
+
31
# Lookup table from conditioning-mode name to its associated key string.
__conditioning_keys__ = dict(concat='c_concat',
                             crossattn='c_crossattn',
                             adm='y')
34
+
35
+
36
def disabled_train(self, mode=True):
    """Drop-in replacement for ``model.train`` that does nothing.

    Assign it over a model's ``train`` method so that later attempts to
    switch between train and eval mode have no effect. ``mode`` is ignored
    and ``self`` is returned.
    """
    return self
40
+
41
+
42
def uniform_on_device(r1, r2, shape, device):
    """Return ``(r1 - r2) * u + r2`` with ``u`` drawn by ``torch.rand`` on ``device``.

    ``shape`` is unpacked into the ``torch.rand`` call.
    """
    unit_noise = torch.rand(*shape, device=device)
    return (r1 - r2) * unit_noise + r2
44
+
45
+ '''
46
+ class tree:
47
+ LatentDiffusion (subclass of DDPM)
48
+ self.model: DiffusionWrapper (defined in DDPM)
49
+ self.diffusion_model: UNet
50
+ self.first_stage_model: AutoencoderKL
51
+ self.cond_stage_model: FrozenOpenCLIP
52
+ '''
53
+
54
+
55
+ class DDPM(pl.LightningModule):
56
+ # classic DDPM with Gaussian diffusion, in image space
57
+ def __init__(self,
58
+ unet_config,
59
+ timesteps=1000,
60
+ beta_schedule="linear",
61
+ loss_type="l2",
62
+ ckpt_path=None,
63
+ ignore_keys=[],
64
+ load_only_unet=False,
65
+ monitor="val/loss",
66
+ use_ema=True,
67
+ first_stage_key="image",
68
+ image_size=256,
69
+ channels=3,
70
+ log_every_t=100,
71
+ clip_denoised=True,
72
+ linear_start=1e-4,
73
+ linear_end=2e-2,
74
+ cosine_s=8e-3,
75
+ given_betas=None,
76
+ original_elbo_weight=0.,
77
+ v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
78
+ l_simple_weight=1.,
79
+ conditioning_key=None,
80
+ parameterization="eps", # all assuming fixed variance schedules
81
+ scheduler_config=None,
82
+ use_positional_encodings=False,
83
+ learn_logvar=False,
84
+ logvar_init=0.,
85
+ make_it_fit=False,
86
+ ucg_training=None,
87
+ reset_ema=False,
88
+ reset_num_ema_updates=False,
89
+ ):
90
+ super().__init__()
91
+ assert parameterization in ["eps", "x0", "v"], 'currently only supporting "eps" and "x0" and "v"'
92
+ self.parameterization = parameterization
93
+ print(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
94
+ self.cond_stage_model = None
95
+ self.clip_denoised = clip_denoised
96
+ self.log_every_t = log_every_t
97
+ self.first_stage_key = first_stage_key
98
+ self.image_size = image_size # try conv?
99
+ self.channels = channels
100
+ self.use_positional_encodings = use_positional_encodings
101
+ self.model = DiffusionWrapper(unet_config, conditioning_key)
102
+ count_params(self.model, verbose=True)
103
+ self.use_ema = use_ema
104
+ if self.use_ema:
105
+ self.model_ema = LitEma(self.model)
106
+ print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
107
+
108
+ self.use_scheduler = scheduler_config is not None
109
+ if self.use_scheduler:
110
+ self.scheduler_config = scheduler_config
111
+
112
+ self.v_posterior = v_posterior
113
+ self.original_elbo_weight = original_elbo_weight
114
+ self.l_simple_weight = l_simple_weight
115
+
116
+ if monitor is not None:
117
+ self.monitor = monitor
118
+ self.make_it_fit = make_it_fit
119
+ if reset_ema: assert exists(ckpt_path)
120
+ if ckpt_path is not None:
121
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
122
+ if reset_ema:
123
+ assert self.use_ema
124
+ print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
125
+ self.model_ema = LitEma(self.model)
126
+ if reset_num_ema_updates:
127
+ print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
128
+ assert self.use_ema
129
+ self.model_ema.reset_num_updates()
130
+
131
+ self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
132
+ linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
133
+
134
+ self.loss_type = loss_type
135
+
136
+ self.learn_logvar = learn_logvar
137
+ self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
138
+ if self.learn_logvar:
139
+ self.logvar = nn.Parameter(self.logvar, requires_grad=True)
140
+
141
+ self.ucg_training = ucg_training or dict()
142
+ if self.ucg_training:
143
+ self.ucg_prng = np.random.RandomState()
144
+
145
+ def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
146
+ linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
147
+ if exists(given_betas):
148
+ betas = given_betas
149
+ else:
150
+ betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
151
+ cosine_s=cosine_s)
152
+ alphas = 1. - betas
153
+ alphas_cumprod = np.cumprod(alphas, axis=0)
154
+ alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
155
+
156
+ timesteps, = betas.shape
157
+ self.num_timesteps = int(timesteps)
158
+ self.linear_start = linear_start
159
+ self.linear_end = linear_end
160
+ assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
161
+
162
+ to_torch = partial(torch.tensor, dtype=torch.float32)
163
+
164
+ self.register_buffer('betas', to_torch(betas))
165
+ self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
166
+ self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
167
+
168
+ # calculations for diffusion q(x_t | x_{t-1}) and others
169
+ self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
170
+ self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
171
+ self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
172
+ self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
173
+ self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
174
+
175
+ # calculations for posterior q(x_{t-1} | x_t, x_0)
176
+ posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
177
+ 1. - alphas_cumprod) + self.v_posterior * betas
178
+ # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
179
+ self.register_buffer('posterior_variance', to_torch(posterior_variance))
180
+ # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
181
+ self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
182
+ self.register_buffer('posterior_mean_coef1', to_torch(
183
+ betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
184
+ self.register_buffer('posterior_mean_coef2', to_torch(
185
+ (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
186
+
187
+ if self.parameterization == "eps":
188
+ lvlb_weights = self.betas ** 2 / (
189
+ 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
190
+ elif self.parameterization == "x0":
191
+ lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
192
+ elif self.parameterization == "v":
193
+ lvlb_weights = torch.ones_like(self.betas ** 2 / (
194
+ 2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod)))
195
+ else:
196
+ raise NotImplementedError("mu not supported")
197
+ lvlb_weights[0] = lvlb_weights[1]
198
+ self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
199
+ assert not torch.isnan(self.lvlb_weights).all()
200
+
201
@contextmanager
def ema_scope(self, context=None):
    """
    Context manager that runs its body with the EMA weights loaded into ``self.model``.

    When ``self.use_ema`` is falsy the manager is a no-op. Otherwise the current
    parameters are stored in ``self.model_ema``, the EMA weights are copied into
    the model, and the stored parameters are restored on exit (also when the body
    raises).

    :param context: optional label; when not None a message is printed on entry and exit.
    """
    announce = context is not None
    if self.use_ema:
        ema = self.model_ema
        ema.store(self.model.parameters())
        ema.copy_to(self.model)
        if announce:
            print(f"{context}: Switched to EMA weights")
    try:
        yield None
    finally:
        # use_ema is read again here, so the restore mirrors whatever the flag says at exit
        if self.use_ema:
            self.model_ema.restore(self.model.parameters())
            if announce:
                print(f"{context}: Restored training weights")
215
+
216
@torch.no_grad()
def init_from_ckpt(self, path, ignore_keys=(), only_model=False):
    """
    Load weights from a checkpoint file into this module (or only into ``self.model``).

    :param path: checkpoint path handed to ``torch.load``.
    :param ignore_keys: iterable of key prefixes; every state-dict key starting with
        one of them is dropped before loading.
    :param only_model: when True, load into ``self.model`` instead of ``self``.

    If ``self.make_it_fit`` is set, entries whose shape differs from the current
    parameter/buffer are tiled cyclically along the first two axes and rescaled
    before loading. Loading is always non-strict; counts of missing and unexpected
    keys are printed.
    """
    # NOTE(security): torch.load unpickles the file; only load checkpoints from trusted sources.
    sd = torch.load(path, map_location="cpu")
    if "state_dict" in sd:
        sd = sd["state_dict"]

    # A key is deleted once even when it matches several prefixes; the previous
    # nested loop tried to delete it again and raised KeyError.
    ignored_prefixes = tuple(ignore_keys)
    for k in list(sd.keys()):
        if k.startswith(ignored_prefixes):
            print("Deleting key {} from state_dict.".format(k))
            del sd[k]

    if self.make_it_fit:
        n_params = len([name for name, _ in
                        itertools.chain(self.named_parameters(),
                                        self.named_buffers())])
        for name, param in tqdm(
                itertools.chain(self.named_parameters(),
                                self.named_buffers()),
                desc="Fitting old weights to new weights",
                total=n_params
        ):
            if name not in sd:
                continue
            old_shape = sd[name].shape
            new_shape = param.shape
            assert len(old_shape) == len(new_shape)
            if len(new_shape) > 2:
                # we only modify first two axes
                assert new_shape[2:] == old_shape[2:]
            # assumes first axis corresponds to output dim
            if not new_shape == old_shape:
                new_param = param.clone()
                old_param = sd[name]
                if len(new_shape) == 1:
                    for i in range(new_param.shape[0]):
                        new_param[i] = old_param[i % old_shape[0]]
                elif len(new_shape) >= 2:
                    for i in range(new_param.shape[0]):
                        for j in range(new_param.shape[1]):
                            new_param[i, j] = old_param[i % old_shape[0], j % old_shape[1]]

                    # count how often each old input column was reused and divide by it
                    n_used_old = torch.ones(old_shape[1])
                    for j in range(new_param.shape[1]):
                        n_used_old[j % old_shape[1]] += 1
                    n_used_new = torch.zeros(new_shape[1])
                    for j in range(new_param.shape[1]):
                        n_used_new[j] = n_used_old[j % old_shape[1]]

                    n_used_new = n_used_new[None, :]
                    while len(n_used_new.shape) < len(new_shape):
                        n_used_new = n_used_new.unsqueeze(-1)
                    new_param /= n_used_new

                sd[name] = new_param

    target = self.model if only_model else self
    missing, unexpected = target.load_state_dict(sd, strict=False)
    print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
    if len(missing) > 0:
        print(f"Missing Keys:\n {missing}")
    if len(unexpected) > 0:
        print(f"\nUnexpected Keys:\n {unexpected}")
278
+
279
def q_mean_variance(self, x_start, t):
    """
    Parameters of the forward distribution q(x_t | x_0).

    :param x_start: the [N x C x ...] tensor of noiseless inputs.
    :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
    :return: tuple (mean, variance, log_variance); each coefficient is looked up
        through ``extract_into_tensor`` using ``x_start``'s shape.
    """
    shape = x_start.shape
    mean_coef = extract_into_tensor(self.sqrt_alphas_cumprod, t, shape)
    mean = mean_coef * x_start
    variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, shape)
    log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, shape)
    return mean, variance, log_variance
290
+
291
def predict_start_from_noise(self, x_t, t, noise):
    """
    Recover the x_0 estimate from a noisy sample and a noise prediction.

    Computes ``sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * noise``.

    :param x_t: noisy sample.
    :param t: timestep indices used to look up the coefficients.
    :param noise: predicted noise.
    """
    x_scale = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    noise_scale = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return x_scale * x_t - noise_scale * noise
296
+
297
def predict_start_from_z_and_v(self, x_t, t, v):
    """
    Recover the x_0 estimate from a noisy sample and a v-prediction.

    Computes ``sqrt_alphas_cumprod[t] * x_t - sqrt_one_minus_alphas_cumprod[t] * v``
    using the buffers of those names registered on this module.
    """
    signal_coef = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape)
    noise_coef = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape)
    return signal_coef * x_t - noise_coef * v
304
+
305
def predict_eps_from_z_and_v(self, x_t, t, v):
    """
    Recover the noise estimate from a noisy sample and a v-prediction.

    Computes ``sqrt_alphas_cumprod[t] * v + sqrt_one_minus_alphas_cumprod[t] * x_t``.
    """
    signal_coef = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape)
    noise_coef = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape)
    return signal_coef * v + noise_coef * x_t
310
+
311
def q_posterior(self, x_start, x_t, t):
    """
    Parameters of the posterior q(x_{t-1} | x_t, x_0).

    :return: tuple (posterior_mean, posterior_variance, posterior_log_variance_clipped),
        built from the ``posterior_*`` buffers looked up at ``t``.
    """
    shape = x_t.shape
    coef_x0 = extract_into_tensor(self.posterior_mean_coef1, t, shape)
    coef_xt = extract_into_tensor(self.posterior_mean_coef2, t, shape)
    posterior_mean = coef_x0 * x_start + coef_xt * x_t
    posterior_variance = extract_into_tensor(self.posterior_variance, t, shape)
    posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, shape)
    return posterior_mean, posterior_variance, posterior_log_variance_clipped
319
+
320
def p_mean_variance(self, x, t, clip_denoised: bool):
    """
    Run the model on ``x`` at step ``t`` and return the reverse-step distribution parameters.

    :param x: current noisy sample.
    :param t: timestep indices.
    :param clip_denoised: when True, clamp the x_0 estimate in place to [-1, 1].
    :return: tuple (model_mean, posterior_variance, posterior_log_variance) from ``q_posterior``.
    :raises NotImplementedError: for parameterizations other than "eps" and "x0"
        (previously this surfaced as an UnboundLocalError on ``x_recon``).
    """
    model_out = self.model(x, t)
    if self.parameterization == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
    elif self.parameterization == "x0":
        x_recon = model_out
    else:
        raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")
    if clip_denoised:
        x_recon.clamp_(-1., 1.)

    model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    return model_mean, posterior_variance, posterior_log_variance
331
+
332
@torch.no_grad()
def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
    """
    Take one reverse-diffusion step from ``x`` at timestep ``t``.

    The result is ``model_mean + mask * exp(0.5 * log_variance) * noise`` where the
    mask is zero for batch entries with ``t == 0`` so that no noise is added there.
    """
    batch = x.shape[0]
    model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
    noise = noise_like(x.shape, x.device, repeat_noise)
    # shape (batch, 1, 1, ...) so the mask broadcasts over all non-batch axes
    mask_shape = (batch,) + (1,) * (len(x.shape) - 1)
    nonzero_mask = (1 - (t == 0).float()).reshape(*mask_shape)
    return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
340
+
341
@torch.no_grad()
def p_sample_loop(self, shape, return_intermediates=False):
    """
    Sample by starting from Gaussian noise and applying ``p_sample`` for every
    timestep from ``num_timesteps - 1`` down to 0.

    :param shape: shape of the tensor to generate; ``shape[0]`` is the batch size.
    :param return_intermediates: when True, also return the list of logged intermediates
        (the initial noise plus every ``log_every_t``-th step and the first iteration).
    """
    device = self.betas.device
    batch = shape[0]
    last_step = self.num_timesteps - 1
    img = torch.randn(shape, device=device)
    intermediates = [img]
    steps = reversed(range(0, self.num_timesteps))
    for step in tqdm(steps, desc='Sampling t', total=self.num_timesteps):
        t = torch.full((batch,), step, device=device, dtype=torch.long)
        img = self.p_sample(img, t, clip_denoised=self.clip_denoised)
        if step % self.log_every_t == 0 or step == last_step:
            intermediates.append(img)
    return (img, intermediates) if return_intermediates else img
355
+
356
@torch.no_grad()
def sample(self, batch_size=16, return_intermediates=False):
    """
    Draw ``batch_size`` samples of shape (channels, image_size, image_size) via ``p_sample_loop``.

    :param return_intermediates: forwarded unchanged to ``p_sample_loop``.
    """
    side = self.image_size
    shape = (batch_size, self.channels, side, side)
    return self.p_sample_loop(shape, return_intermediates=return_intermediates)
362
+
363
def q_sample(self, x_start, t, noise=None):
    """
    Noise ``x_start`` to timestep ``t``:
    ``sqrt_alphas_cumprod[t] * x_start + sqrt_one_minus_alphas_cumprod[t] * noise``.

    :param noise: optional noise tensor; the fallback passed to ``default`` is
        ``torch.randn_like(x_start)``.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    signal_coef = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
    noise_coef = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
    return signal_coef * x_start + noise_coef * noise
367
+
368
def get_v(self, x, noise, t):
    """
    Build the v-parameterization target:
    ``sqrt_alphas_cumprod[t] * noise - sqrt_one_minus_alphas_cumprod[t] * x``.
    """
    signal_coef = extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape)
    noise_coef = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape)
    return signal_coef * noise - noise_coef * x
373
+
374
def get_loss(self, pred, target, mean=True):
    """
    Compute the L1 or L2 loss between ``pred`` and ``target`` according to ``self.loss_type``.

    :param pred: model prediction.
    :param target: regression target.
    :param mean: when True reduce to a scalar mean, otherwise return the element-wise loss.
    :raises NotImplementedError: if ``self.loss_type`` is neither 'l1' nor 'l2'.
    """
    if self.loss_type == 'l1':
        loss = (target - pred).abs()
        if mean:
            loss = loss.mean()
    elif self.loss_type == 'l2':
        reduction = 'mean' if mean else 'none'
        loss = torch.nn.functional.mse_loss(target, pred, reduction=reduction)
    else:
        # the message used to be a plain string, so it printed the literal "{loss_type}"
        raise NotImplementedError(f"unknown loss type '{self.loss_type}'")

    return loss
388
+
389
def p_losses(self, x_start, t, noise=None):
    """
    Training loss for one batch at timesteps ``t``.

    Noises ``x_start`` with ``q_sample``, runs ``self.model`` on the result and
    compares the output with the target selected by ``self.parameterization``
    ("eps": the noise, "x0": ``x_start``, "v": ``get_v``).

    :return: tuple (loss, loss_dict); ``loss_dict`` holds ``loss_simple``, ``loss_vlb``
        and ``loss`` under a ``train/`` or ``val/`` prefix depending on ``self.training``.
    :raises NotImplementedError: for any other parameterization.
    """
    noise = default(noise, lambda: torch.randn_like(x_start))
    x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
    model_out = self.model(x_noisy, t)

    if self.parameterization == "eps":
        target = noise
    elif self.parameterization == "x0":
        target = x_start
    elif self.parameterization == "v":
        target = self.get_v(x_start, noise, t)
    else:
        raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")

    # one loss value per batch element (mean over dims 1, 2, 3)
    per_sample = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3])
    log_prefix = 'train' if self.training else 'val'

    simple_mean = per_sample.mean()
    loss_simple = per_sample.mean() * self.l_simple_weight
    loss_vlb = (self.lvlb_weights[t] * per_sample).mean()
    loss = loss_simple + self.original_elbo_weight * loss_vlb

    loss_dict = {
        f'{log_prefix}/loss_simple': simple_mean,
        f'{log_prefix}/loss_vlb': loss_vlb,
        f'{log_prefix}/loss': loss,
    }
    return loss, loss_dict
419
+
420
def forward(self, x, *args, **kwargs):
    """
    Draw one random timestep in ``[0, num_timesteps)`` per batch element and
    return ``p_losses`` for it; extra arguments are forwarded to ``p_losses``.
    """
    batch = x.shape[0]
    t = torch.randint(0, self.num_timesteps, (batch,), device=self.device).long()
    return self.p_losses(x, t, *args, **kwargs)
425
+
426
def get_input(self, batch, k):
    """
    Fetch ``batch[k]`` and return it as a contiguous float tensor in 'b c h w' layout.

    A 3-dimensional entry first gets a trailing singleton axis; the tensor is then
    rearranged from 'b h w c' to 'b c h w'.
    """
    x = batch[k]
    if len(x.shape) == 3:
        x = x[..., None]
    channels_first = rearrange(x, 'b h w c -> b c h w')
    return channels_first.to(memory_format=torch.contiguous_format).float()
433
+
434
def shared_step(self, batch):
    """
    Extract the model input from ``batch`` (key ``self.first_stage_key``) and run
    the module on it.

    :return: tuple (loss, loss_dict) as produced by calling the module.
    """
    inputs = self.get_input(batch, self.first_stage_key)
    loss, loss_dict = self(inputs)
    return loss, loss_dict
438
+
439
def training_step(self, batch, batch_idx):
    """
    Run one training step and log its metrics.

    For every key in ``self.ucg_training`` each batch entry is replaced, with
    probability ``p``, by the configured ``val`` (an empty string when ``val`` is None).
    The loss from ``shared_step`` is then logged together with ``global_step`` and,
    when ``self.use_scheduler`` is set, the first param group's learning rate.

    :return: the loss from ``shared_step``.
    """
    for key in self.ucg_training:
        spec = self.ucg_training[key]
        drop_prob = spec["p"]
        replacement = spec["val"]
        if replacement is None:
            replacement = ""
        entries = batch[key]
        for idx in range(len(entries)):
            if self.ucg_prng.choice(2, p=[1 - drop_prob, drop_prob]):
                batch[key][idx] = replacement

    loss, loss_dict = self.shared_step(batch)

    self.log_dict(loss_dict, prog_bar=True,
                  logger=True, on_step=True, on_epoch=True)
    self.log("global_step", self.global_step,
             prog_bar=True, logger=True, on_step=True, on_epoch=False)

    if self.use_scheduler:
        current_lr = self.optimizers().param_groups[0]['lr']
        self.log('lr_abs', current_lr, prog_bar=True, logger=True, on_step=True, on_epoch=False)

    return loss
462
+
463
@torch.no_grad()
def validation_step(self, batch, batch_idx):
    """
    Evaluate ``batch`` twice, once with the current weights and once inside
    ``ema_scope``, and log both metric dicts per epoch. Keys of the EMA metrics
    get an ``_ema`` suffix.
    """
    log_kwargs = dict(prog_bar=False, logger=True, on_step=False, on_epoch=True)
    _, metrics_plain = self.shared_step(batch)
    with self.ema_scope():
        _, metrics_ema = self.shared_step(batch)
        metrics_ema = {name + '_ema': value for name, value in metrics_ema.items()}
    self.log_dict(metrics_plain, **log_kwargs)
    self.log_dict(metrics_ema, **log_kwargs)
471
+
472
def on_train_batch_end(self, *args, **kwargs):
    """Call ``self.model_ema`` on ``self.model`` after a training batch when ``self.use_ema`` is set."""
    if not self.use_ema:
        return
    self.model_ema(self.model)
475
+
476
def _get_rows_from_list(self, samples):
    """
    Arrange a sequence of batches ('n b c h w') into one image grid.

    The entries are regrouped batch-major ('(b n) c h w') and passed to
    ``make_grid`` with ``len(samples)`` images per row.
    """
    per_row = len(samples)
    flat = rearrange(samples, 'n b c h w -> (b n) c h w')
    return make_grid(flat, nrow=per_row)
482
+
483
@torch.no_grad()
def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
    """
    Build a dict of tensors for visual logging.

    :param batch: batch to read inputs from (key ``self.first_stage_key``).
    :param N: maximum number of inputs kept and number of samples drawn.
    :param n_row: maximum number of inputs used for the forward-diffusion row.
    :param sample: when True, also draw samples inside ``ema_scope`` and log the denoising row.
    :param return_keys: optional keys to restrict the result to; if none of them
        is present in the log, the full log is returned.
    :return: dict with "inputs", "diffusion_row" and, if ``sample``, "samples" and "denoise_row".
    """
    x = self.get_input(batch, self.first_stage_key)
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    x = x.to(self.device)[:N]
    log = {"inputs": x}

    # forward-diffusion row: noise the first n_row inputs at the logged timesteps
    x_start = x[:n_row]
    diffusion_row = []
    for step in range(self.num_timesteps):
        if step % self.log_every_t != 0 and step != self.num_timesteps - 1:
            continue
        t = repeat(torch.tensor([step]), '1 -> b', b=n_row)
        t = t.to(self.device).long()
        noise = torch.randn_like(x_start)
        diffusion_row.append(self.q_sample(x_start=x_start, t=t, noise=noise))

    log["diffusion_row"] = self._get_rows_from_list(diffusion_row)

    if sample:
        # denoising row, generated with the EMA weights when enabled
        with self.ema_scope("Plotting"):
            samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)

        log["samples"] = samples
        log["denoise_row"] = self._get_rows_from_list(denoise_row)

    if not return_keys:
        return log
    if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
        return log
    return {key: log[key] for key in return_keys}
520
+
521
def configure_optimizers(self):
    """
    Create an AdamW optimizer over ``self.model``'s parameters with
    ``self.learning_rate``; ``self.logvar`` is appended when ``self.learn_logvar`` is set.
    """
    trainable = list(self.model.parameters())
    if self.learn_logvar:
        trainable = trainable + [self.logvar]
    return torch.optim.AdamW(trainable, lr=self.learning_rate)
528
+
529
+
530
+ class LatentDiffusion(DDPM):
531
+ """main class"""
532
+
533
def __init__(self,
             first_stage_config,
             cond_stage_config,
             num_timesteps_cond=None,
             cond_stage_key="image",
             cond_stage_trainable=False,
             concat_mode=True,
             cond_stage_forward=None,
             conditioning_key=None,
             scale_factor=1.0,
             scale_by_std=False,
             force_null_conditioning=False,
             *args, **kwargs):
    """
    Set up the latent diffusion model: first stage, conditioning stage and the
    base diffusion model (via ``super().__init__``).

    :param first_stage_config: config passed to ``instantiate_first_stage``.
    :param cond_stage_config: config passed to ``instantiate_cond_stage``; the
        string '__is_unconditional__' disables conditioning unless
        ``force_null_conditioning`` is set.
    :param num_timesteps_cond: number of conditioning timesteps (falls back to 1);
        must not exceed ``kwargs['timesteps']``.
    :param conditioning_key: when None it is derived from ``concat_mode``
        ('concat' or 'crossattn').
    :param scale_factor: latent scale; registered as a buffer when ``scale_by_std`` is True.
    :param kwargs: forwarded to the parent constructor after ``ckpt_path``,
        ``reset_ema``, ``reset_num_ema_updates`` and ``ignore_keys`` are popped.
    """
    self.force_null_conditioning = force_null_conditioning
    self.num_timesteps_cond = default(num_timesteps_cond, 1)
    self.scale_by_std = scale_by_std
    assert self.num_timesteps_cond <= kwargs['timesteps']
    # for backwards compatibility after implementation of DiffusionWrapper
    if conditioning_key is None:
        conditioning_key = 'concat' if concat_mode else 'crossattn'
    if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning:
        conditioning_key = None
    # checkpoint handling is deferred until both stages exist, so keep these away from the parent
    ckpt_path = kwargs.pop("ckpt_path", None)
    reset_ema = kwargs.pop("reset_ema", False)
    reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False)
    ignore_keys = kwargs.pop("ignore_keys", [])
    super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
    self.concat_mode = concat_mode
    self.cond_stage_trainable = cond_stage_trainable
    self.cond_stage_key = cond_stage_key
    try:
        self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
    except Exception:
        # best effort: configs without params.ddconfig.ch_mult fall back to 0.
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
        self.num_downs = 0
    if not scale_by_std:
        self.scale_factor = scale_factor
    else:
        self.register_buffer('scale_factor', torch.tensor(scale_factor))
    self.instantiate_first_stage(first_stage_config)  # AutoencoderKL
    self.instantiate_cond_stage(cond_stage_config)  # FrozenOpenCLIPEmbedder
    self.cond_stage_forward = cond_stage_forward
    self.clip_denoised = False
    self.bbox_tokenizer = None

    self.restarted_from_ckpt = False
    if ckpt_path is not None:
        self.init_from_ckpt(ckpt_path, ignore_keys)
        self.restarted_from_ckpt = True
        if reset_ema:
            assert self.use_ema
            print(
                "Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
            self.model_ema = LitEma(self.model)
    if reset_num_ema_updates:
        print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
        assert self.use_ema
        self.model_ema.reset_num_updates()
590
+
591
def make_cond_schedule(self, ):
    """
    Build ``self.cond_ids``: a long tensor of length ``num_timesteps`` filled with
    ``num_timesteps - 1`` whose first ``num_timesteps_cond`` entries are evenly
    spaced (rounded) steps from 0 to ``num_timesteps - 1``.
    """
    last_step = self.num_timesteps - 1
    self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=last_step, dtype=torch.long)
    spaced = torch.linspace(0, last_step, self.num_timesteps_cond)
    self.cond_ids[:self.num_timesteps_cond] = torch.round(spaced).long()
595
+
596
@rank_zero_only
@torch.no_grad()
def on_train_batch_start(self, batch, batch_idx, dataloader_idx):
    """
    On the very first training batch of a fresh run with ``scale_by_std`` enabled,
    replace ``scale_factor`` by ``1 / std`` of that batch's first-stage encodings.

    Nothing happens on later batches or when the model was restored from a checkpoint.
    """
    if not self.scale_by_std:
        return
    # only for very first batch
    if not (self.current_epoch == 0 and self.global_step == 0 and batch_idx == 0):
        return
    if self.restarted_from_ckpt:
        return

    assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously'
    # set rescale weight to 1./std of encodings
    print("### USING STD-RESCALING ###")
    x = super().get_input(batch, self.first_stage_key).to(self.device)
    z = self.get_first_stage_encoding(self.encode_first_stage(x)).detach()
    # drop the existing attribute before registering the buffer of the same name
    del self.scale_factor
    self.register_buffer('scale_factor', 1. / z.flatten().std())
    print(f"setting self.scale_factor to {self.scale_factor}")
    print("### USING STD-RESCALING ###")
612
+
613
def register_schedule(self,
                      given_betas=None, beta_schedule="linear", timesteps=1000,
                      linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """
    Register the parent's noise schedule, then set ``self.shorten_cond_schedule``
    (True when ``num_timesteps_cond > 1``) and build the conditioning schedule if needed.
    """
    super().register_schedule(given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s)

    shorten = self.num_timesteps_cond > 1
    self.shorten_cond_schedule = shorten
    if shorten:
        self.make_cond_schedule()
621
+
622
def instantiate_first_stage(self, config):
    """
    Build the first-stage model from ``config`` and disable gradients on all of
    its parameters. The model's train/eval mode is left untouched here.
    """
    self.first_stage_model = instantiate_from_config(config)
    for weight in self.first_stage_model.parameters():
        weight.requires_grad = False
629
+
630
def instantiate_cond_stage(self, config):
    """
    Set ``self.cond_stage_model`` from ``config``.

    With a trainable conditioning stage the model is always instantiated from
    ``config`` (the special strings are rejected by assertion). Otherwise
    '__is_first_stage__' reuses the first-stage model, '__is_unconditional__'
    sets it to None, and any other config is instantiated with gradients
    disabled on its parameters.
    """
    if self.cond_stage_trainable:
        assert config != '__is_first_stage__'
        assert config != '__is_unconditional__'
        self.cond_stage_model = instantiate_from_config(config)
        return

    if config == "__is_first_stage__":
        print("Using first stage also as cond stage.")
        self.cond_stage_model = self.first_stage_model
        return

    if config == "__is_unconditional__":
        print(f"Training {self.__class__.__name__} as an unconditional model.")
        self.cond_stage_model = None
        return

    # train/eval mode is not changed here; only gradients are disabled
    self.cond_stage_model = instantiate_from_config(config)
    for weight in self.cond_stage_model.parameters():
        weight.requires_grad = False
651
+
652
def _get_denoise_row_from_list(self, samples, desc='', force_no_decoder_quantization=False):
    """
    Decode every latent in ``samples`` with the first stage and arrange the
    results into one image grid with ``len(samples)`` images per row.

    :param desc: progress-bar description.
    :param force_no_decoder_quantization: forwarded to ``decode_first_stage`` as ``force_not_quantize``.
    """
    decoded = [
        self.decode_first_stage(latent.to(self.device),
                                force_not_quantize=force_no_decoder_quantization)
        for latent in tqdm(samples, desc=desc)
    ]
    per_row = len(decoded)
    stacked = torch.stack(decoded)  # n_log_step, n_row, C, H, W
    flat = rearrange(stacked, 'n b c h w -> (b n) c h w')
    return make_grid(flat, nrow=per_row)
663
+
664
def get_first_stage_encoding(self, encoder_posterior):
    """
    Turn the first-stage encoder output into a scaled latent.

    A ``DiagonalGaussianDistribution`` is sampled, a tensor is used as is; the
    result is multiplied by ``self.scale_factor``.

    :raises NotImplementedError: for any other input type.
    """
    if isinstance(encoder_posterior, DiagonalGaussianDistribution):
        latent = encoder_posterior.sample()
        return self.scale_factor * latent
    if isinstance(encoder_posterior, torch.Tensor):
        return self.scale_factor * encoder_posterior
    raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
672
+
673
def get_learned_conditioning(self, c):
    """
    Encode the raw conditioning ``c`` with the conditioning-stage model.

    If ``self.cond_stage_forward`` names a method, that method is called.
    Otherwise a callable ``encode`` attribute is preferred (taking the mode of a
    ``DiagonalGaussianDistribution`` result), falling back to calling the model itself.
    """
    model = self.cond_stage_model
    if self.cond_stage_forward is not None:
        assert hasattr(model, self.cond_stage_forward)
        return getattr(model, self.cond_stage_forward)(c)

    encode = getattr(model, 'encode', None)
    if not callable(encode):
        return model(c)

    c = encode(c)
    if isinstance(c, DiagonalGaussianDistribution):
        c = c.mode()
    return c
685
+
686
def meshgrid(self, h, w):
    """
    Return an (h, w, 2) integer tensor whose entry [i, j] is ``[i, j]``
    (row index first, column index second).
    """
    rows = torch.arange(0, h).view(h, 1, 1).expand(h, w, 1)
    cols = torch.arange(0, w).view(1, w, 1).expand(h, w, 1)
    return torch.cat([rows, cols], dim=-1)
692
+
693
def delta_border(self, h, w):
    """
    :param h: height
    :param w: width
    :return: (h, w) tensor with the normalized distance of each pixel to the
     nearest image border, with min distance = 0 at the border and
     max distance = 0.5 at the image center
    """
    lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2)
    # coordinates scaled to [0, 1] along each axis
    coords = self.meshgrid(h, w) / lower_right_corner
    # distance to the nearer border per axis, then the smaller of the two axes
    per_axis = torch.minimum(coords, 1 - coords)
    return per_axis.min(dim=-1)[0]
706
+
707
def get_weighting(self, h, w, Ly, Lx, device):
    """
    Build the per-pixel weights used when blending overlapping crops.

    The border distance of an (h, w) crop is clipped to the configured range and
    repeated for all ``Ly * Lx`` crops. When ``split_input_params["tie_braker"]``
    is set, the result is additionally multiplied by a clipped border distance
    over the (Ly, Lx) crop grid.

    :return: tensor of shape (1, h * w, Ly * Lx) on ``device``.
    """
    params = self.split_input_params
    n_crops = Ly * Lx

    pixel_weight = torch.clip(self.delta_border(h, w),
                              params["clip_min_weight"],
                              params["clip_max_weight"])
    weighting = pixel_weight.view(1, h * w, 1).repeat(1, 1, n_crops).to(device)

    if params["tie_braker"]:
        crop_weight = torch.clip(self.delta_border(Ly, Lx),
                                 params["clip_min_tie_weight"],
                                 params["clip_max_tie_weight"])
        weighting = weighting * crop_weight.view(1, 1, n_crops).to(device)
    return weighting
722
+
723
+ def get_fold_unfold(self, x, kernel_size, stride, uf=1, df=1): # todo load once not every time, shorten code
724
+ """
725
+ :param x: img of size (bs, c, h, w)
726
+ :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1])
727
+ """
728
+ bs, nc, h, w = x.shape
729
+
730
+ # number of crops in image
731
+ Ly = (h - kernel_size[0]) // stride[0] + 1
732
+ Lx = (w - kernel_size[1]) // stride[1] + 1
733
+
734
+ if uf == 1 and df == 1:
735
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
736
+ unfold = torch.nn.Unfold(**fold_params)
737
+
738
+ fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params)
739
+
740
+ weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, Lx, x.device).to(x.dtype)
741
+ normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap
742
+ weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx))
743
+
744
+ elif uf > 1 and df == 1:
745
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
746
+ unfold = torch.nn.Unfold(**fold_params)
747
+
748
+ fold_params2 = dict(kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf),
749
+ dilation=1, padding=0,
750
+ stride=(stride[0] * uf, stride[1] * uf))
751
+ fold = torch.nn.Fold(output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2)
752
+
753
+ weighting = self.get_weighting(kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device).to(x.dtype)
754
+ normalization = fold(weighting).view(1, 1, h * uf, w * uf) # normalizes the overlap
755
+ weighting = weighting.view((1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx))
756
+
757
+ elif df > 1 and uf == 1:
758
+ fold_params = dict(kernel_size=kernel_size, dilation=1, padding=0, stride=stride)
759
+ unfold = torch.nn.Unfold(**fold_params)
760
+
761
+ fold_params2 = dict(kernel_size=(kernel_size[0] // df, kernel_size[0] // df),
762
+ dilation=1, padding=0,
763
+ stride=(stride[0] // df, stride[1] // df))
764
+ fold = torch.nn.Fold(output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2)
765
+
766
+ weighting = self.get_weighting(kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device).to(x.dtype)
767
+ normalization = fold(weighting).view(1, 1, h // df, w // df) # normalizes the overlap
768
+ weighting = weighting.view((1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx))
769
+
770
+ else:
771
+ raise NotImplementedError
772
+
773
+ return fold, unfold, normalization, weighting
774
+
775
@torch.no_grad()
def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
              cond_key=None, return_original_cond=False, bs=None, return_x=False):
    """
    Produce the latent and the conditioning for a batch.

    :param batch: input batch.
    :param k: key of the image entry, read through the parent ``get_input``.
    :param return_first_stage_outputs: also append the input ``x`` and its reconstruction.
    :param force_c_encode: encode the conditioning even when the cond stage is trainable.
    :param cond_key: conditioning key; falls back to ``self.cond_stage_key``.
    :param return_original_cond: also append the un-encoded conditioning.
    :param bs: optional cap on the number of batch entries used.
    :param return_x: also append the input ``x``.
    :return: list ``[z, c, ...]`` where ``z`` is the detached scaled first-stage latent
        and ``c`` the conditioning (None when the model is unconditional or null
        conditioning is forced), followed by the optional extras in the order above.
    """
    x = super().get_input(batch, k)
    if bs is not None:
        x = x[:bs]
    x = x.to(self.device)
    z = self.get_first_stage_encoding(self.encode_first_stage(x)).detach()

    conditioned = self.model.conditioning_key is not None and not self.force_null_conditioning
    if conditioned:
        if cond_key is None:
            cond_key = self.cond_stage_key

        # pick the raw conditioning input
        if cond_key == self.first_stage_key:
            xc = x
        elif cond_key in ['caption', 'coordinates_bbox', "txt"]:
            xc = batch[cond_key]
        elif cond_key in ['class_label', 'cls']:
            xc = batch
        else:
            xc = super().get_input(batch, cond_key).to(self.device)

        # encode it now unless the cond stage is trained (then it is passed on raw)
        if not self.cond_stage_trainable or force_c_encode:
            if isinstance(xc, (dict, list)):
                c = self.get_learned_conditioning(xc)
            else:
                c = self.get_learned_conditioning(xc.to(self.device))
        else:
            c = xc
        if bs is not None:
            c = c[:bs]

        if self.use_positional_encodings:
            pos_x, pos_y = self.compute_latent_shifts(batch)
            ckey = __conditioning_keys__[self.model.conditioning_key]
            c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y}
    else:
        c = None
        xc = None
        if self.use_positional_encodings:
            pos_x, pos_y = self.compute_latent_shifts(batch)
            c = {'pos_x': pos_x, 'pos_y': pos_y}

    out = [z, c]
    if return_first_stage_outputs:
        xrec = self.decode_first_stage(z)
        out += [x, xrec]
    if return_x:
        out.append(x)
    if return_original_cond:
        out.append(xc)
    return out
827
+
828
@torch.no_grad()
def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False):
    """
    Decode a latent with the first-stage model after undoing ``self.scale_factor``.

    :param z: latent to decode.
    :param predict_cids: when True, ``z`` is first mapped to codebook entries
        (4-dimensional input is reduced with an argmax over dim 1).
    :param force_not_quantize: accepted but not used in this method.
    """
    if predict_cids:
        if z.dim() == 4:
            z = torch.argmax(z.exp(), dim=1).long()
        entries = self.first_stage_model.quantize.get_codebook_entry(z, shape=None)
        z = rearrange(entries, 'b h w c -> b c h w').contiguous()

    inv_scale = 1. / self.scale_factor
    return self.first_stage_model.decode(inv_scale * z)
838
+
839
@torch.no_grad()
def encode_first_stage(self, x):
    """Encode ``x`` with the first-stage model's ``encode`` and return its result unchanged."""
    encoder = self.first_stage_model
    return encoder.encode(x)
842
+
843
def shared_step(self, batch, **kwargs):
    """
    Extract latent and conditioning from ``batch`` and return the result of
    calling the module on them. Extra keyword arguments are ignored.
    """
    latent, cond = self.get_input(batch, self.first_stage_key)
    return self(latent, cond)
847
+
848
def forward(self, x, c, *args, **kwargs):
    """
    Draw one random timestep per batch element and return ``p_losses`` for it.

    When the model is conditional, ``c`` must not be None; it is encoded first if
    the conditioning stage is trainable, and noised with ``q_sample`` at the
    timesteps from ``self.cond_ids`` if ``self.shorten_cond_schedule`` is set.
    """
    t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
    is_conditional = self.model.conditioning_key is not None
    if is_conditional:
        assert c is not None
        if self.cond_stage_trainable:
            c = self.get_learned_conditioning(c)
        if self.shorten_cond_schedule:  # TODO: drop this option
            cond_t = self.cond_ids[t].to(self.device)
            cond_noise = torch.randn_like(c.float())
            c = self.q_sample(x_start=c, t=cond_t, noise=cond_noise)
    return self.p_losses(x, c, t, *args, **kwargs)
858
+
859
def apply_model(self, x_noisy, t, cond, return_ids=False):
    """
    Call ``self.model`` on ``x_noisy`` at ``t`` with the conditioning as keyword arguments.

    A dict ``cond`` is passed through as is (hybrid case). Anything else is wrapped
    in a list if needed and passed under 'c_concat' when the model's conditioning
    key is 'concat', otherwise under 'c_crossattn'.

    :param return_ids: when False and the model returns a tuple, only its first element is returned.
    """
    if not isinstance(cond, dict):
        cond_list = cond if isinstance(cond, list) else [cond]
        slot = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
        cond = {slot: cond_list}

    x_recon = self.model(x_noisy, t, **cond)

    take_first = isinstance(x_recon, tuple) and not return_ids
    return x_recon[0] if take_first else x_recon
875
+
876
def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
    """
    Recover the noise estimate from a noisy sample and an x_0 prediction:
    ``(sqrt_recip_alphas_cumprod[t] * x_t - pred_xstart) / sqrt_recipm1_alphas_cumprod[t]``.
    """
    recip = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape)
    numerator = recip * x_t - pred_xstart
    return numerator / extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
879
+
880
def _prior_bpd(self, x_start):
    """
    Prior KL term of the variational lower bound, in bits per dimension.

    The KL is taken between q(x_T | x_0) at the last timestep and a standard
    normal (``mean2=0.0``, ``logvar2=0.0``), then divided by ``log(2)``.
    This term can't be optimized, as it only depends on the encoder.

    :param x_start: the [N x C x ...] tensor of inputs.
    :return: a batch of [N] KL values (in bits), one per batch element.
    """
    n = x_start.shape[0]
    last_t = torch.tensor([self.num_timesteps - 1] * n, device=x_start.device)
    mean_T, _, log_var_T = self.q_mean_variance(x_start, last_t)
    kl_nats = normal_kl(mean1=mean_T, logvar1=log_var_T, mean2=0.0, logvar2=0.0)
    return mean_flat(kl_nats) / np.log(2.0)
893
+
894
def p_losses(self, x_start, cond, t, noise=None):
    """
    Training loss for one batch of latents at timesteps ``t``.

    :param x_start: clean latents.
    :param cond: conditioning passed to ``apply_model``.
    :param t: timestep indices, one per batch element.
    :param noise: optional noise; Gaussian noise shaped like ``x_start`` is drawn when None.
    :return: tuple (loss, loss_dict). ``loss`` is the log-variance-weighted simple
        loss scaled by ``l_simple_weight`` plus ``original_elbo_weight`` times the
        ``lvlb_weights``-weighted loss; ``loss_dict`` holds the individual terms
        under a ``train/`` or ``val/`` prefix.
    :raises NotImplementedError: if ``self.parameterization`` is not "x0", "eps" or "v".
    """
    if noise is None:
        noise = torch.randn_like(x_start)
    x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
    model_output = self.apply_model(x_noisy, t, cond)

    loss_dict = {}
    prefix = 'train' if self.training else 'val'

    if self.parameterization == "x0":
        target = x_start
    elif self.parameterization == "eps":
        target = noise
    elif self.parameterization == "v":
        target = self.get_v(x_start, noise, t)
    else:
        raise NotImplementedError(f"Parameterization {self.parameterization} not yet supported")

    # per-sample loss, computed once and shared by the simple and the VLB term
    loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
    loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()})

    logvar_t = self.logvar[t].to(self.device)
    loss = loss_simple / torch.exp(logvar_t) + logvar_t
    if self.learn_logvar:
        loss_dict.update({f'{prefix}/loss_gamma': loss.mean()})
        loss_dict.update({'logvar': self.logvar.data.mean()})

    loss = self.l_simple_weight * loss.mean()

    loss_vlb = (self.lvlb_weights[t] * loss_simple).mean()
    loss_dict.update({f'{prefix}/loss_vlb': loss_vlb})
    loss += (self.original_elbo_weight * loss_vlb)
    loss_dict.update({f'{prefix}/loss': loss})

    return loss, loss_dict
930
+
931
def p_mean_variance(self, x, c, t, clip_denoised: bool, return_codebook_ids=False, quantize_denoised=False,
                    return_x0=False, score_corrector=None, corrector_kwargs=None):
    """
    Run the conditioned model on ``x`` at ``t`` and return the reverse-step
    distribution parameters from ``q_posterior``.

    :param clip_denoised: clamp the x_0 estimate in place to [-1, 1].
    :param return_codebook_ids: expect ``(output, logits)`` from the model and append the logits to the result.
    :param quantize_denoised: pass the x_0 estimate through the first stage's ``quantize``.
    :param return_x0: append the x_0 estimate to the result (only if codebook ids are not requested).
    :param score_corrector: optional object whose ``modify_score`` adjusts the model output ("eps" only).
    :param corrector_kwargs: keyword arguments for ``modify_score``.
    :return: (model_mean, posterior_variance, posterior_log_variance[, logits | x_recon]).
    :raises NotImplementedError: for parameterizations other than "eps" and "x0".
    """
    model_out = self.apply_model(x, t, c, return_ids=return_codebook_ids)

    if score_corrector is not None:
        assert self.parameterization == "eps"
        model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)

    if return_codebook_ids:
        model_out, logits = model_out

    mode = self.parameterization
    if mode == "eps":
        x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
    elif mode == "x0":
        x_recon = model_out
    else:
        raise NotImplementedError()

    if clip_denoised:
        x_recon.clamp_(-1., 1.)
    if quantize_denoised:
        x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon)

    model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
    base = (model_mean, posterior_variance, posterior_log_variance)
    if return_codebook_ids:
        return base + (logits,)
    if return_x0:
        return base + (x_recon,)
    return base
961
+
962
@torch.no_grad()
def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False,
             return_codebook_ids=False, quantize_denoised=False, return_x0=False,
             temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None):
    """
    Take one conditioned reverse-diffusion step from ``x`` at timestep ``t``.

    :param x: current noisy latent.
    :param c: conditioning forwarded to ``p_mean_variance``.
    :param t: timestep indices; batch entries with ``t == 0`` receive no noise.
    :param clip_denoised: forwarded to ``p_mean_variance``.
    :param repeat_noise: forwarded to ``noise_like``.
    :param return_codebook_ids: no longer supported; requesting it raises.
    :param quantize_denoised: forwarded to ``p_mean_variance``.
    :param return_x0: when True, return ``(sample, x0)`` instead of just the sample.
    :param temperature: multiplier applied to the sampled noise.
    :param noise_dropout: dropout probability applied to the noise when > 0.
    :param score_corrector: forwarded to ``p_mean_variance``.
    :param corrector_kwargs: forwarded to ``p_mean_variance``.
    :raises DeprecationWarning: if ``return_codebook_ids`` is set.
    """
    # Support for codebook ids was dropped; fail before running the model rather
    # than after it (the code that followed the old raise was unreachable).
    if return_codebook_ids:
        raise DeprecationWarning("Support dropped.")

    b, device = x.shape[0], x.device
    outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised,
                                   return_codebook_ids=return_codebook_ids,
                                   quantize_denoised=quantize_denoised,
                                   return_x0=return_x0,
                                   score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
    if return_x0:
        model_mean, _, model_log_variance, x0 = outputs
    else:
        model_mean, _, model_log_variance = outputs

    noise = noise_like(x.shape, device, repeat_noise) * temperature
    if noise_dropout > 0.:
        noise = torch.nn.functional.dropout(noise, p=noise_dropout)
    # no noise when t == 0
    nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))

    sample = model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
    if return_x0:
        return sample, x0
    return sample
992
+
993
@torch.no_grad()
def progressive_denoising(self, cond, shape, verbose=True, callback=None, quantize_denoised=False,
                          img_callback=None, mask=None, x0=None, temperature=1., noise_dropout=0.,
                          score_corrector=None, corrector_kwargs=None, batch_size=None, x_T=None, start_T=None,
                          log_every_t=None):
    """Run the reverse process and collect intermediate start-sample estimates.

    Args:
        cond: conditioning (tensor, list or dict of those); cropped to the
            batch size along its first dimension. May be ``None``.
        shape: sample shape; when ``batch_size`` is given it is prepended to
            ``shape``, otherwise ``shape[0]`` is taken as the batch size.
        verbose: wrap the timestep iterator in ``tqdm``.
        callback: optional callable invoked with the step index.
        img_callback: optional callable invoked with ``(img, step)``.
        mask, x0: when ``mask`` is given, the sample is blended with a noised
            version of ``x0`` after every step (``x0`` is then required).
        temperature: a number applied at every step, or a sequence indexed by
            timestep.
        noise_dropout, score_corrector, corrector_kwargs, quantize_denoised:
            forwarded to ``self.p_sample``.
        x_T: optional starting sample; random normal noise otherwise.
        start_T: optional cap on the number of timesteps.
        log_every_t: logging interval; falls back to ``self.log_every_t``.

    Returns:
        ``(img, intermediates)`` where ``intermediates`` holds the
        start-sample estimates recorded at the logged steps.
    """
    if not log_every_t:
        log_every_t = self.log_every_t
    timesteps = self.num_timesteps
    if batch_size is not None:
        b = batch_size
        shape = [batch_size] + list(shape)
    else:
        b = batch_size = shape[0]
    img = torch.randn(shape, device=self.device) if x_T is None else x_T
    intermediates = []

    if cond is not None:
        if isinstance(cond, dict):
            cond = {key: [item[:batch_size] for item in value] if isinstance(value, list)
                    else value[:batch_size]
                    for key, value in cond.items()}
        elif isinstance(cond, list):
            cond = [item[:batch_size] for item in cond]
        else:
            cond = cond[:batch_size]

    if start_T is not None:
        timesteps = min(timesteps, start_T)
    steps = reversed(range(0, timesteps))
    iterator = tqdm(steps, desc='Progressive Generation', total=timesteps) if verbose else steps
    # Accept any plain number (ints included); an exact float type check would
    # leave e.g. temperature=1 as a scalar and break the per-step indexing below.
    if isinstance(temperature, (int, float)):
        temperature = [temperature] * timesteps

    for i in iterator:
        ts = torch.full((b,), i, device=self.device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img, x0_partial = self.p_sample(img, cond, ts,
                                        clip_denoised=self.clip_denoised,
                                        quantize_denoised=quantize_denoised, return_x0=True,
                                        temperature=temperature[i], noise_dropout=noise_dropout,
                                        score_corrector=score_corrector, corrector_kwargs=corrector_kwargs)
        if mask is not None:
            assert x0 is not None
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(x0_partial)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)
    return img, intermediates
1048
+
1049
@torch.no_grad()
def p_sample_loop(self, cond, shape, return_intermediates=False,
                  x_T=None, verbose=True, callback=None, timesteps=None, quantize_denoised=False,
                  mask=None, x0=None, img_callback=None, start_T=None,
                  log_every_t=None):
    """Run the full reverse sampling loop.

    Args:
        cond: conditioning forwarded to ``self.p_sample``.
        shape: shape of the sample; ``shape[0]`` is the batch size.
        return_intermediates: also return the list of logged samples.
        x_T: optional starting sample; random normal noise otherwise.
        verbose: wrap the timestep iterator in ``tqdm``.
        callback: optional callable invoked with the step index.
        timesteps: number of steps; defaults to ``self.num_timesteps``.
        quantize_denoised: forwarded to ``self.p_sample``.
        mask, x0: when ``mask`` is given, the sample is blended with a noised
            version of ``x0`` after every step (``x0`` is then required).
        img_callback: optional callable invoked with ``(img, step)``.
        start_T: optional cap on the number of timesteps.
        log_every_t: logging interval; falls back to ``self.log_every_t``.

    Returns:
        The final sample, or ``(sample, intermediates)`` when
        ``return_intermediates`` is set.
    """
    if not log_every_t:
        log_every_t = self.log_every_t
    device = self.betas.device
    b = shape[0]
    img = torch.randn(shape, device=device) if x_T is None else x_T

    intermediates = [img]
    if timesteps is None:
        timesteps = self.num_timesteps
    if start_T is not None:
        timesteps = min(timesteps, start_T)
    steps = reversed(range(0, timesteps))
    iterator = tqdm(steps, desc='Sampling t', total=timesteps) if verbose else steps

    if mask is not None:
        assert x0 is not None
        # spatial size has to match: compare BOTH trailing spatial axes
        # (a [2:3] slice would only look at the first of them)
        assert x0.shape[2:4] == mask.shape[2:4]

    for i in iterator:
        ts = torch.full((b,), i, device=device, dtype=torch.long)
        if self.shorten_cond_schedule:
            assert self.model.conditioning_key != 'hybrid'
            tc = self.cond_ids[ts].to(cond.device)
            cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))

        img = self.p_sample(img, cond, ts,
                            clip_denoised=self.clip_denoised,
                            quantize_denoised=quantize_denoised)
        if mask is not None:
            img_orig = self.q_sample(x0, ts)
            img = img_orig * mask + (1. - mask) * img

        if i % log_every_t == 0 or i == timesteps - 1:
            intermediates.append(img)
        if callback:
            callback(i)
        if img_callback:
            img_callback(img, i)

    if return_intermediates:
        return img, intermediates
    return img
1099
+
1100
+ @torch.no_grad()
1101
+ def sample(self, cond, batch_size=16, return_intermediates=False, x_T=None,
1102
+ verbose=True, timesteps=None, quantize_denoised=False,
1103
+ mask=None, x0=None, shape=None, **kwargs):
1104
+ if shape is None:
1105
+ shape = (batch_size, self.channels, self.image_size, self.image_size)
1106
+ if cond is not None:
1107
+ if isinstance(cond, dict):
1108
+ cond = {key: cond[key][:batch_size] if not isinstance(cond[key], list) else
1109
+ list(map(lambda x: x[:batch_size], cond[key])) for key in cond}
1110
+ else:
1111
+ cond = [c[:batch_size] for c in cond] if isinstance(cond, list) else cond[:batch_size]
1112
+ return self.p_sample_loop(cond,
1113
+ shape,
1114
+ return_intermediates=return_intermediates, x_T=x_T,
1115
+ verbose=verbose, timesteps=timesteps, quantize_denoised=quantize_denoised,
1116
+ mask=mask, x0=x0)
1117
+
1118
@torch.no_grad()
def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs):
    """Sample with either a ``DDIMSampler`` or ``self.sample``.

    Returns ``(samples, intermediates)`` in both cases; extra keyword
    arguments are forwarded to whichever sampler is used.
    """
    if not ddim:
        samples, intermediates = self.sample(cond=cond, batch_size=batch_size,
                                             return_intermediates=True, **kwargs)
        return samples, intermediates

    sampler = DDIMSampler(self)
    latent_shape = (self.channels, self.image_size, self.image_size)
    samples, intermediates = sampler.sample(ddim_steps, batch_size,
                                            latent_shape, cond, verbose=False, **kwargs)
    return samples, intermediates
1131
+
1132
@torch.no_grad()
def get_unconditional_conditioning(self, batch_size, null_label=None):
    """Build the conditioning used for the unconditional branch.

    Without ``null_label`` only class-label conditioning is supported: the
    conditioner supplies the input and its encoding is returned as-is.
    With ``null_label`` the label is encoded and the result is repeated
    ``batch_size`` times along a leading singleton dimension.

    Raises:
        NotImplementedError: when ``null_label`` is ``None`` and
            ``self.cond_stage_key`` is not a class-label key.
    """
    if null_label is None:
        if self.cond_stage_key not in ["class_label", "cls"]:
            raise NotImplementedError("todo")
        xc = self.cond_stage_model.get_unconditional_conditioning(batch_size, device=self.device)
        return self.get_learned_conditioning(xc)

    xc = null_label
    if isinstance(xc, ListConfig):
        xc = list(xc)
    # containers are encoded directly; other objects are first moved to the
    # model device when they offer a ``to`` method
    if not isinstance(xc, (dict, list)) and hasattr(xc, "to"):
        xc = xc.to(self.device)
    c = self.get_learned_conditioning(xc)

    if isinstance(c, list):  # in case the encoder gives us a list
        for idx, entry in enumerate(c):
            c[idx] = repeat(entry, '1 ... -> b ...', b=batch_size).to(self.device)
        return c
    return repeat(c, '1 ... -> b ...', b=batch_size).to(self.device)
1156
+
1157
@torch.no_grad()
def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=50, ddim_eta=0., return_keys=None,
               quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
               plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
               use_ema_scope=True,
               **kwargs):
    """Collect a dictionary of tensors for visual logging.

    Depending on the flags the result contains inputs, reconstructions, a
    rendering of the conditioning, a forward-diffusion grid, samples
    (plain, quantized, classifier-free guided), in/outpainting results and
    a progressive denoising row.

    Args:
        batch: data batch handed to ``self.get_input``.
        N: maximum number of items to log.
        n_row: number of items used for the diffusion row.
        sample: draw samples (and the optional quantized variant).
        ddim_steps: number of DDIM steps; ``None`` selects ``self.sample``.
        ddim_eta: passed to the sampler as ``eta``.
        return_keys: if given and it intersects the logged keys, only these
            keys are returned.
        use_ema_scope: run sampling inside ``self.ema_scope``.

    Returns:
        dict mapping log names to tensors.
    """
    ema_scope = self.ema_scope if use_ema_scope else nullcontext
    use_ddim = ddim_steps is not None

    log = {}
    z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key,
                                       return_first_stage_outputs=True,
                                       force_c_encode=True,
                                       return_original_cond=True,
                                       bs=N)
    N = min(x.shape[0], N)
    n_row = min(x.shape[0], n_row)
    log["inputs"] = x
    log["reconstruction"] = xrec

    if self.model.conditioning_key is not None:
        if hasattr(self.cond_stage_model, "decode"):
            xc = self.cond_stage_model.decode(c)
            log["conditioning"] = xc
        elif self.cond_stage_key in ("caption", "txt"):
            xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
            log["conditioning"] = xc
        elif self.cond_stage_key in ("class_label", "cls"):
            try:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
                log['conditioning'] = xc
            except KeyError:
                # probably no "human_label" in batch
                pass
        elif isimage(xc):
            log["conditioning"] = xc
        if ismap(xc):
            log["original_conditioning"] = self.to_rgb(xc)

    if plot_diffusion_rows:
        # forward-diffuse the first n_row latents at every logged timestep
        z_start = z[:n_row]
        final_step = self.num_timesteps - 1
        logged_steps = [step for step in range(self.num_timesteps)
                        if step % self.log_every_t == 0 or step == final_step]
        decoded = []
        for step in logged_steps:
            t = repeat(torch.tensor([step]), '1 -> b', b=n_row).to(self.device).long()
            noise = torch.randn_like(z_start)
            z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
            decoded.append(self.decode_first_stage(z_noisy))

        diffusion_row = torch.stack(decoded)  # n_log_step, n_row, C, H, W
        diffusion_grid = rearrange(diffusion_row, 'n b c h w -> (b n) c h w')
        log["diffusion_row"] = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])

    if sample:
        with ema_scope("Sampling"):
            samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                     ddim_steps=ddim_steps, eta=ddim_eta)
        log["samples"] = self.decode_first_stage(samples)
        if plot_denoise_rows:
            log["denoise_row"] = self._get_denoise_row_from_list(z_denoise_row)

        first_stage_quantizes = not isinstance(self.first_stage_model, AutoencoderKL) and not isinstance(
            self.first_stage_model, IdentityFirstStage)
        if quantize_denoised and first_stage_quantizes:
            # also display when quantizing x0 while sampling
            with ema_scope("Plotting Quantized Denoised"):
                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta,
                                                         quantize_denoised=True)
            log["samples_x0_quantized"] = self.decode_first_stage(samples.to(self.device))

    if unconditional_guidance_scale > 1.0:
        uc = self.get_unconditional_conditioning(N, unconditional_guidance_label)
        if self.model.conditioning_key == "crossattn-adm":
            uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
        with ema_scope("Sampling with classifier-free guidance"):
            samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                             ddim_steps=ddim_steps, eta=ddim_eta,
                                             unconditional_guidance_scale=unconditional_guidance_scale,
                                             unconditional_conditioning=uc,
                                             )
            log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = self.decode_first_stage(samples_cfg)

    if inpaint:
        # make a simple center square; zeros will be filled in
        h, w = z.shape[2], z.shape[3]
        mask = torch.ones(N, h, w).to(self.device)
        mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0.
        mask = mask[:, None, ...]
        with ema_scope("Plotting Inpaint"):
            samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                         ddim_steps=ddim_steps, x0=z[:N], mask=mask)
        log["samples_inpainting"] = self.decode_first_stage(samples.to(self.device))
        log["mask"] = mask

        # outpaint: invert the mask so the border is generated instead
        mask = 1. - mask
        with ema_scope("Plotting Outpaint"):
            samples, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim, eta=ddim_eta,
                                         ddim_steps=ddim_steps, x0=z[:N], mask=mask)
        log["samples_outpainting"] = self.decode_first_stage(samples.to(self.device))

    if plot_progressive_rows:
        with ema_scope("Plotting Progressives"):
            img, progressives = self.progressive_denoising(c,
                                                           shape=(self.channels, self.image_size, self.image_size),
                                                           batch_size=N)
        log["progressive_row"] = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")

    if return_keys:
        if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
            return log
        return {key: log[key] for key in return_keys}
    return log
1286
+
1287
def configure_optimizers(self):
    """Create the AdamW optimizer and, if enabled, a LambdaLR scheduler.

    The optimized parameters are those of ``self.model``, plus the
    conditioner's when ``self.cond_stage_trainable`` is set, plus
    ``self.logvar`` when ``self.learn_logvar`` is set.

    Returns:
        The optimizer alone, or ``([optimizer], [scheduler_dict])`` when
        ``self.use_scheduler`` is set.
    """
    lr = self.learning_rate
    trainable = list(self.model.parameters())
    if self.cond_stage_trainable:
        print(f"{self.__class__.__name__}: Also optimizing conditioner params!")
        trainable = trainable + list(self.cond_stage_model.parameters())
    if self.learn_logvar:
        print('Diffusion model optimizing logvar')
        trainable.append(self.logvar)
    optimizer = torch.optim.AdamW(trainable, lr=lr)

    if not self.use_scheduler:
        return optimizer

    assert 'target' in self.scheduler_config
    schedule_source = instantiate_from_config(self.scheduler_config)

    print("Setting up LambdaLR scheduler...")
    scheduler_entry = {
        'scheduler': LambdaLR(optimizer, lr_lambda=schedule_source.schedule),
        'interval': 'step',
        'frequency': 1,
    }
    return [optimizer], [scheduler_entry]
1310
+
1311
@torch.no_grad()
def to_rgb(self, x):
    """Project ``x`` to three channels with a 1x1 convolution and rescale to [-1, 1].

    The projection weights are drawn randomly on first use and cached on
    the instance as ``self.colorize``.
    """
    x = x.float()
    if not hasattr(self, "colorize"):
        self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x)
    projected = nn.functional.conv2d(x, weight=self.colorize)
    # min-max normalise over the whole tensor
    lowest, highest = projected.min(), projected.max()
    return 2. * (projected - lowest) / (highest - lowest) - 1.
1319
+
1320
+
1321
class DiffusionWrapper(pl.LightningModule):
    """Wrap the diffusion network and route conditioning into it.

    ``conditioning_key`` selects how the conditioning inputs are passed:
    concatenated to the input, as cross-attention context, as the ``y``
    argument, or a combination of these.
    """

    def __init__(self, diff_model_config, conditioning_key):
        super().__init__()
        # "sequential_crossattn" is consumed here and removed from the config
        self.sequential_cross_attn = diff_model_config.pop("sequential_crossattn", False)
        self.diffusion_model = instantiate_from_config(diff_model_config)
        self.conditioning_key = conditioning_key
        assert self.conditioning_key in [None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', 'crossattn-adm']

    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None):
        """Call the wrapped model according to ``self.conditioning_key``.

        Raises:
            NotImplementedError: for an unknown conditioning key.
        """
        mode = self.conditioning_key

        if mode is None:
            return self.diffusion_model(x, t)

        if mode == 'concat':
            stacked = torch.cat([x] + c_concat, dim=1)
            return self.diffusion_model(stacked, t)

        if mode == 'crossattn':  # default
            context = c_crossattn if self.sequential_cross_attn else torch.cat(c_crossattn, 1)
            if hasattr(self, "scripted_diffusion_model"):
                # TorchScript changes names of the arguments
                # with argument cc defined as context=cc scripted model will produce
                # an error: RuntimeError: forward() is missing value for argument 'argument_3'.
                return self.scripted_diffusion_model(x, t, context)
            return self.diffusion_model(x, t, context=context)

        if mode == 'hybrid':
            stacked = torch.cat([x] + c_concat, dim=1)
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(stacked, t, context=context)

        if mode == 'hybrid-adm':
            assert c_adm is not None
            stacked = torch.cat([x] + c_concat, dim=1)
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(stacked, t, context=context, y=c_adm)

        if mode == 'crossattn-adm':
            assert c_adm is not None
            context = torch.cat(c_crossattn, 1)
            return self.diffusion_model(x, t, context=context, y=c_adm)

        if mode == 'adm':
            # the first cross-attention entry is passed as the ``y`` argument
            return self.diffusion_model(x, t, y=c_crossattn[0])

        raise NotImplementedError()
1367
+
1368
+
1369
class LatentUpscaleDiffusion(LatentDiffusion):
    """Latent diffusion conditioned on the output of a frozen low-scale model.

    The low-scale model's output is supplied as concat conditioning and its
    noise level as ``c_adm``, next to the encoded cross-attention conditioning.
    """

    def __init__(self, *args, low_scale_config, low_scale_key="LR", noise_level_key=None, **kwargs):
        super().__init__(*args, **kwargs)
        # assumes that neither the cond_stage nor the low_scale_model contain trainable params
        assert not self.cond_stage_trainable
        self.instantiate_low_stage(low_scale_config)
        self.low_scale_key = low_scale_key
        self.noise_level_key = noise_level_key

    def instantiate_low_stage(self, config):
        """Instantiate the low-scale model in eval mode with frozen parameters."""
        model = instantiate_from_config(config)
        self.low_scale_model = model.eval()
        self.low_scale_model.train = disabled_train
        for param in self.low_scale_model.parameters():
            param.requires_grad = False

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False):
        """Return the latent and the conditioning dict for a batch.

        In ``log_mode`` the originals, reconstructions, the low-scale input,
        its reconstruction and the noise level are returned as well.

        Raises:
            NotImplementedError: when ``noise_level_key`` is configured.
        """
        if not log_mode:
            z, c = super().get_input(batch, k, force_c_encode=True, bs=bs)
        else:
            z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                                  force_c_encode=True, return_original_cond=True, bs=bs)
        x_low = batch[self.low_scale_key][:bs]
        x_low = rearrange(x_low, 'b h w c -> b c h w')
        x_low = x_low.to(memory_format=torch.contiguous_format).float()
        zx, noise_level = self.low_scale_model(x_low)
        if self.noise_level_key is not None:
            # get noise level from batch instead, e.g. when extracting a custom noise level for bsr
            raise NotImplementedError('TODO')

        all_conds = {"c_concat": [zx], "c_crossattn": [c], "c_adm": noise_level}
        if log_mode:
            # TODO: maybe disable if too expensive
            x_low_rec = self.low_scale_model.decode(zx)
            return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level
        return z, all_conds

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                   plot_denoise_rows=False, plot_progressive_rows=True, plot_diffusion_rows=True,
                   unconditional_guidance_scale=1., unconditional_guidance_label=None, use_ema_scope=True,
                   **kwargs):
        """Collect a dictionary of tensors for visual logging.

        Logs inputs, reconstructions, the low-resolution input and its
        reconstruction, a rendering of the conditioning and, depending on the
        flags, a diffusion grid, samples, guided samples and a progressive row.
        ``return_keys`` is accepted but not used by this implementation.
        """
        ema_scope = self.ema_scope if use_ema_scope else nullcontext
        use_ddim = ddim_steps is not None

        log = {}
        z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input(batch, self.first_stage_key, bs=N,
                                                                          log_mode=True)
        N = min(x.shape[0], N)
        n_row = min(x.shape[0], n_row)
        log["inputs"] = x
        log["reconstruction"] = xrec
        log["x_lr"] = x_low
        log[f"x_lr_rec_@noise_levels{'-'.join(map(str, noise_level.cpu().numpy()))}"] = x_low_rec

        if self.model.conditioning_key is not None:
            if hasattr(self.cond_stage_model, "decode"):
                xc = self.cond_stage_model.decode(c)
                log["conditioning"] = xc
            elif self.cond_stage_key in ["caption", "txt"]:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
                log["conditioning"] = xc
            elif self.cond_stage_key in ['class_label', 'cls']:
                # Best-effort, matching LatentDiffusion.log_images: a batch
                # without "human_label" must not abort image logging.
                try:
                    xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
                    log['conditioning'] = xc
                except KeyError:
                    pass
            elif isimage(xc):
                log["conditioning"] = xc
            if ismap(xc):
                log["original_conditioning"] = self.to_rgb(xc)

        if plot_diffusion_rows:
            # get diffusion row
            diffusion_row = []
            z_start = z[:n_row]
            for step in range(self.num_timesteps):
                if step % self.log_every_t == 0 or step == self.num_timesteps - 1:
                    t = repeat(torch.tensor([step]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # get denoise row
            with ema_scope("Sampling"):
                samples, z_denoise_row = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid

        if unconditional_guidance_scale > 1.0:
            uc_tmp = self.get_unconditional_conditioning(N, unconditional_guidance_label)
            # TODO explore better "unconditional" choices for the other keys
            # maybe guide away from empty text label and highest noise level and maximally degraded zx?
            uc = {}
            for k in c:
                if k == "c_crossattn":
                    assert isinstance(c[k], list) and len(c[k]) == 1
                    uc[k] = [uc_tmp]
                elif k == "c_adm":  # todo: only run with text-based guidance?
                    assert isinstance(c[k], torch.Tensor)
                    # alternative: torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
                    uc[k] = c[k]
                elif isinstance(c[k], list):
                    uc[k] = list(c[k])  # shallow copy so uc does not alias c's list
                else:
                    uc[k] = c[k]

            with ema_scope("Sampling with classifier-free guidance"):
                samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        if plot_progressive_rows:
            with ema_scope("Plotting Progressives"):
                img, progressives = self.progressive_denoising(c,
                                                               shape=(self.channels, self.image_size, self.image_size),
                                                               batch_size=N)
            prog_row = self._get_denoise_row_from_list(progressives, desc="Progressive Generation")
            log["progressive_row"] = prog_row

        return log
1505
+
1506
+
1507
class LatentFinetuneDiffusion(LatentDiffusion):
    """
    Basis for different finetunes, such as inpainting or depth2image
    To disable finetuning mode, set finetune_keys to None
    """

    def __init__(self,
                 concat_keys: tuple,
                 finetune_keys=("model.diffusion_model.input_blocks.0.0.weight",
                                "model_ema.diffusion_modelinput_blocks00weight"
                                ),
                 keep_finetune_dims=4,
                 # if model was trained without concat mode before and we would like to keep these channels
                 c_concat_log_start=None,  # to log reconstruction of c_concat codes
                 c_concat_log_end=None,
                 *args, **kwargs
                 ):
        # The checkpoint is loaded by this class (after the parent is built),
        # so both options are removed from kwargs before calling the parent.
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", list())
        super().__init__(*args, **kwargs)
        self.finetune_keys = finetune_keys
        self.concat_keys = concat_keys
        self.keep_dims = keep_finetune_dims
        self.c_concat_log_start = c_concat_log_start
        self.c_concat_log_end = c_concat_log_end
        if exists(self.finetune_keys):
            assert exists(ckpt_path), 'can only finetune from a given checkpoint'
        if exists(ckpt_path):
            self.init_from_ckpt(ckpt_path, ignore_keys)

    def init_from_ckpt(self, path, ignore_keys=(), only_model=False):
        """Load weights from ``path``, widening the finetune entries.

        Keys starting with any prefix in ``ignore_keys`` are dropped. Entries
        named in ``self.finetune_keys`` are copied into the first
        ``self.keep_dims`` channels of a zero tensor shaped like the matching
        model parameter, so additional input channels start at zero.

        Args:
            path: checkpoint file; either a state dict or a dict holding one
                under ``"state_dict"``.
            ignore_keys: iterable of key prefixes to drop.
            only_model: load into ``self.model`` instead of ``self``.
        """
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from trusted sources.
        sd = torch.load(path, map_location="cpu")
        if "state_dict" in sd:
            sd = sd["state_dict"]

        for k in list(sd.keys()):
            if any(k.startswith(ik) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]
                # The entry is gone: deleting it again for a second matching
                # prefix, or widening it below, would raise KeyError.
                continue

            # make it explicit, finetune by including extra input channels
            if exists(self.finetune_keys) and k in self.finetune_keys:
                new_entry = None
                for name, param in self.named_parameters():
                    if name in self.finetune_keys:
                        print(
                            f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only")
                        new_entry = torch.zeros_like(param)  # zero init
                assert exists(new_entry), 'did not find matching parameter to modify'
                new_entry[:, :self.keep_dims, ...] = sd[k]
                sd[k] = new_entry

        target = self.model if only_model else self
        missing, unexpected = target.load_state_dict(sd, strict=False)
        print(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, sample=True, ddim_steps=200, ddim_eta=1., return_keys=None,
                   quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True,
                   plot_diffusion_rows=True, unconditional_guidance_scale=1., unconditional_guidance_label=None,
                   use_ema_scope=True,
                   **kwargs):
        """Collect a dictionary of tensors for visual logging.

        Logs inputs, reconstructions, a rendering of the conditioning, an
        optional decoded slice of the concat conditioning and, depending on
        the flags, a diffusion grid, samples and guided samples.
        ``return_keys``, ``quantize_denoised``, ``inpaint`` and
        ``plot_progressive_rows`` are accepted but not used here.
        """
        ema_scope = self.ema_scope if use_ema_scope else nullcontext
        use_ddim = ddim_steps is not None

        log = {}
        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True)
        c_cat, c = c["c_concat"][0], c["c_crossattn"][0]
        N = min(x.shape[0], N)
        n_row = min(x.shape[0], n_row)
        log["inputs"] = x
        log["reconstruction"] = xrec

        if self.model.conditioning_key is not None:
            if hasattr(self.cond_stage_model, "decode"):
                xc = self.cond_stage_model.decode(c)
                log["conditioning"] = xc
            elif self.cond_stage_key in ["caption", "txt"]:
                xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
                log["conditioning"] = xc
            elif self.cond_stage_key in ['class_label', 'cls']:
                # Best-effort, matching LatentDiffusion.log_images: a batch
                # without "human_label" must not abort image logging.
                try:
                    xc = log_txt_as_img((x.shape[2], x.shape[3]), batch["human_label"], size=x.shape[2] // 25)
                    log['conditioning'] = xc
                except KeyError:
                    pass
            elif isimage(xc):
                log["conditioning"] = xc
            if ismap(xc):
                log["original_conditioning"] = self.to_rgb(xc)

        if not (self.c_concat_log_start is None and self.c_concat_log_end is None):
            log["c_concat_decoded"] = self.decode_first_stage(c_cat[:, self.c_concat_log_start:self.c_concat_log_end])

        if plot_diffusion_rows:
            # get diffusion row
            diffusion_row = []
            z_start = z[:n_row]
            for step in range(self.num_timesteps):
                if step % self.log_every_t == 0 or step == self.num_timesteps - 1:
                    t = repeat(torch.tensor([step]), '1 -> b', b=n_row)
                    t = t.to(self.device).long()
                    noise = torch.randn_like(z_start)
                    z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise)
                    diffusion_row.append(self.decode_first_stage(z_noisy))

            diffusion_row = torch.stack(diffusion_row)  # n_log_step, n_row, C, H, W
            diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w')
            diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w')
            diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0])
            log["diffusion_row"] = diffusion_grid

        if sample:
            # get denoise row
            with ema_scope("Sampling"):
                samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                         batch_size=N, ddim=use_ddim,
                                                         ddim_steps=ddim_steps, eta=ddim_eta)
            x_samples = self.decode_first_stage(samples)
            log["samples"] = x_samples
            if plot_denoise_rows:
                denoise_grid = self._get_denoise_row_from_list(z_denoise_row)
                log["denoise_row"] = denoise_grid

        if unconditional_guidance_scale > 1.0:
            uc_cross = self.get_unconditional_conditioning(N, unconditional_guidance_label)
            # the concat conditioning is reused unchanged for the unconditional branch
            uc_cat = c_cat
            uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]}
            with ema_scope("Sampling with classifier-free guidance"):
                samples_cfg, _ = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c]},
                                                 batch_size=N, ddim=use_ddim,
                                                 ddim_steps=ddim_steps, eta=ddim_eta,
                                                 unconditional_guidance_scale=unconditional_guidance_scale,
                                                 unconditional_conditioning=uc_full,
                                                 )
                x_samples_cfg = self.decode_first_stage(samples_cfg)
                log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg

        return log
1647
+
1648
+
1649
class LatentInpaintDiffusion(LatentFinetuneDiffusion):
    """
    can either run as pure inpainting model (only concat mode) or with mixed conditionings,
    e.g. mask as concat and text via cross-attn.
    To disable finetuning mode, set finetune_keys to None
    """

    def __init__(self,
                 concat_keys=("mask", "masked_image"),
                 masked_image_key="masked_image",
                 *args, **kwargs
                 ):
        super().__init__(concat_keys, *args, **kwargs)
        self.masked_image_key = masked_image_key
        assert self.masked_image_key in concat_keys

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
        """Return the latent and a dict with concat and cross-attention conditioning.

        Every entry of ``self.concat_keys`` is read from the batch: the masked
        image is encoded with the first stage, all other entries are resized
        to the latent's spatial size. The results are concatenated along the
        channel dimension.

        Returns:
            ``(z, all_conds)``, or ``(z, all_conds, x, xrec, xc)`` when
            ``return_first_stage_outputs`` is set.
        """
        # note: restricted to non-trainable encoders currently
        assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting'
        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                              force_c_encode=True, return_original_cond=True, bs=bs)

        assert exists(self.concat_keys)
        latent_hw = z.shape[-2:]
        c_cat = []
        for ck in self.concat_keys:
            cc = rearrange(batch[ck], 'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
            if bs is not None:
                cc = cc[:bs]
            # move to the model device regardless of whether the batch was cropped
            cc = cc.to(self.device)
            if ck != self.masked_image_key:
                cc = torch.nn.functional.interpolate(cc, size=latent_hw)
            else:
                cc = self.get_first_stage_encoding(self.encode_first_stage(cc))
            c_cat.append(cc)
        c_cat = torch.cat(c_cat, dim=1)
        all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
        if return_first_stage_outputs:
            return z, all_conds, x, xrec, xc
        return z, all_conds

    @torch.no_grad()
    def log_images(self, *args, **kwargs):
        """Extend the parent's log with the masked input image under ``"masked_image"``."""
        log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs)
        # The batch may arrive positionally or as the ``batch`` keyword; read
        # the image through the configured key rather than a hard-coded name.
        batch = args[0] if args else kwargs["batch"]
        log["masked_image"] = rearrange(batch[self.masked_image_key],
                                        'b h w c -> b c h w').to(memory_format=torch.contiguous_format).float()
        return log
1697
+
1698
+
1699
class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion):
    """
    condition on monocular depth estimation
    """

    def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs):
        """Instantiate the depth model from its config.

        Args:
            depth_stage_config: config passed to ``instantiate_from_config``
                to build ``self.depth_model``.
            concat_keys: batch keys for concat conditioning; the first one is
                remembered as ``self.depth_stage_key``.
            *args, **kwargs: forwarded to the parent constructor.
        """
        super().__init__(concat_keys=concat_keys, *args, **kwargs)
        self.depth_model = instantiate_from_config(depth_stage_config)
        self.depth_stage_key = concat_keys[0]

    @staticmethod
    def _rescale_depth(depth, eps=0.001):
        """Rescale each sample of ``depth`` with its own min/max to roughly [-1, 1].

        ``eps`` keeps the division finite when a sample is constant
        (``depth_max == depth_min``).
        """
        depth_min = torch.amin(depth, dim=[1, 2, 3], keepdim=True)
        depth_max = torch.amax(depth, dim=[1, 2, 3], keepdim=True)
        return 2. * (depth - depth_min) / (depth_max - depth_min + eps) - 1.

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
        """Build the latent plus depth-map concat conditioning.

        Args:
            batch: mapping holding the first-stage input and the depth-model
                input under the single key in ``self.concat_keys``.
            k: accepted for interface compatibility; ``self.first_stage_key``
                is what is passed to the parent.
            cond_key: accepted for interface compatibility; not used here.
            bs: optional cap on the number of samples.
            return_first_stage_outputs: when true, also return ``x``,
                ``xrec`` and ``xc`` from the parent ``get_input``.

        Returns:
            ``(z, all_conds)`` or ``(z, all_conds, x, xrec, xc)`` with
            ``all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}``.
        """
        # note: restricted to non-trainable encoders currently
        assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img'
        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                              force_c_encode=True, return_original_cond=True, bs=bs)

        assert exists(self.concat_keys)
        assert len(self.concat_keys) == 1
        c_cat = list()
        for ck in self.concat_keys:
            cc = batch[ck]
            if bs is not None:
                cc = cc[:bs]
            # Always move to the model device (a no-op when already there).
            cc = cc.to(self.device)
            cc = self.depth_model(cc)
            # Bring the predicted depth to the latent resolution.
            cc = torch.nn.functional.interpolate(
                cc,
                size=z.shape[2:],
                mode="bicubic",
                align_corners=False,
            )
            cc = self._rescale_depth(cc)
            c_cat.append(cc)
        c_cat = torch.cat(c_cat, dim=1)
        all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
        if return_first_stage_outputs:
            return z, all_conds, x, xrec, xc
        return z, all_conds

    @torch.no_grad()
    def log_images(self, *args, **kwargs):
        """Extend the parent's image log with the rescaled depth prediction.

        The first positional argument is indexed as the batch mapping; the
        depth model is run on ``batch[self.depth_stage_key]``.
        """
        log = super().log_images(*args, **kwargs)
        depth = self.depth_model(args[0][self.depth_stage_key])
        # Same guarded normalisation as get_input: without the epsilon a
        # constant depth map produced a 0/0 division and NaNs in the log.
        log["depth"] = self._rescale_depth(depth)
        return log
1750
+
1751
+
1752
class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
    """
    condition on low-res image (and optionally on some spatial noise augmentation)
    """

    def __init__(self, concat_keys=("lr",), reshuffle_patch_size=None,
                 low_scale_config=None, low_scale_key=None, *args, **kwargs):
        """Configure the upscaling finetune model.

        Args:
            concat_keys: batch keys for concat conditioning (``get_input``
                requires exactly one).
            reshuffle_patch_size: optional int; when set, spatial patches of
                this size are folded into the channel axis in ``get_input``.
            low_scale_config: optional config for a frozen low-scale model.
            low_scale_key: batch key whose entry is passed through the
                low-scale model; required when ``low_scale_config`` is given.
            *args, **kwargs: forwarded to the parent constructor.
        """
        super().__init__(concat_keys=concat_keys, *args, **kwargs)
        self.reshuffle_patch_size = reshuffle_patch_size
        self.low_scale_model = None
        # Always defined, so the attribute can be read even without a low-scale model.
        self.low_scale_key = low_scale_key
        if low_scale_config is not None:
            print("Initializing a low-scale model")
            assert exists(low_scale_key)
            self.instantiate_low_stage(low_scale_config)

    def instantiate_low_stage(self, config):
        """Build the low-scale model, put it in eval mode and freeze it."""
        model = instantiate_from_config(config)
        self.low_scale_model = model.eval()
        # Replace `train` so later mode switches leave the model in eval mode.
        self.low_scale_model.train = disabled_train
        for param in self.low_scale_model.parameters():
            param.requires_grad = False

    @torch.no_grad()
    def get_input(self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False):
        """Build the latent plus low-resolution concat conditioning.

        Args:
            batch: mapping holding the first-stage input and the low-res
                image under the single key in ``self.concat_keys``.
            k: accepted for interface compatibility; ``self.first_stage_key``
                is what is passed to the parent.
            cond_key: accepted for interface compatibility; not used here.
            bs: optional cap on the number of samples.
            return_first_stage_outputs: when true, also return ``x``,
                ``xrec`` and ``xc`` from the parent ``get_input``.

        Returns:
            ``(z, all_conds)`` or ``(z, all_conds, x, xrec, xc)``.
            ``all_conds`` always has ``"c_concat"`` and ``"c_crossattn"`` and
            additionally ``"c_adm"`` when the low-scale model returned a
            noise level.
        """
        # note: restricted to non-trainable encoders currently
        assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft'
        z, c, x, xrec, xc = super().get_input(batch, self.first_stage_key, return_first_stage_outputs=True,
                                              force_c_encode=True, return_original_cond=True, bs=bs)

        assert exists(self.concat_keys)
        assert len(self.concat_keys) == 1
        # optionally make spatial noise_level here
        c_cat = list()
        noise_level = None
        for ck in self.concat_keys:
            cc = batch[ck]
            cc = rearrange(cc, 'b h w c -> b c h w')
            if exists(self.reshuffle_patch_size):
                assert isinstance(self.reshuffle_patch_size, int)
                cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
                               p1=self.reshuffle_patch_size, p2=self.reshuffle_patch_size)
            if bs is not None:
                cc = cc[:bs]
            # Always move to the model device (a no-op when already there).
            cc = cc.to(self.device)
            if exists(self.low_scale_model) and ck == self.low_scale_key:
                cc, noise_level = self.low_scale_model(cc)
            c_cat.append(cc)
        c_cat = torch.cat(c_cat, dim=1)
        all_conds = {"c_concat": [c_cat], "c_crossattn": [c]}
        if exists(noise_level):
            all_conds["c_adm"] = noise_level
        if return_first_stage_outputs:
            return z, all_conds, x, xrec, xc
        return z, all_conds

    @torch.no_grad()
    def log_images(self, *args, **kwargs):
        """Extend the parent's image log with the low-resolution input.

        The image is read from the configured concat key (``"lr"`` by
        default) and stored in the log as ``"lr"``.
        """
        log = super().log_images(*args, **kwargs)
        # get_input enforces a single concat key; read that one instead of a
        # literal "lr" so custom `concat_keys` do not raise KeyError.
        lr_key = self.concat_keys[0]
        log["lr"] = rearrange(args[0][lr_key], 'b h w c -> b c h w')
        return log
1813
+
1814
+
1815
class ImageEmbeddingConditionedLatentDiffusion(LatentDiffusion):
    """Latent diffusion conditioned on text (cross-attention) and on an image
    embedding passed as ``c_adm``."""

    def __init__(self, embedder_config, embedding_key="jpg", embedding_dropout=0.5,
                 freeze_embedder=True, noise_aug_config=None, *args, **kwargs):
        """
        Args:
            embedder_config: config for the image embedder.
            embedding_key: batch key of the image fed to the embedder.
            embedding_dropout: probability with which a sample's embedding is
                zeroed during training (see ``get_input``).
            freeze_embedder: when true the embedder is put in eval mode and
                its parameters stop requiring gradients.
            noise_aug_config: optional config for a noise augmentor applied to
                the embedding.
            *args, **kwargs: forwarded to ``LatentDiffusion``.
        """
        super().__init__(*args, **kwargs)
        self.embed_key = embedding_key
        self.embedding_dropout = embedding_dropout
        self._init_embedder(embedder_config, freeze_embedder)
        self._init_noise_aug(noise_aug_config)

    def _init_embedder(self, config, freeze=True):
        """Instantiate ``self.embedder``; freeze it when ``freeze`` is true."""
        embedder = instantiate_from_config(config)
        if freeze:
            embedder = embedder.eval()
            # Replace `train` so later mode switches leave the embedder in eval mode.
            embedder.train = disabled_train
            for param in embedder.parameters():
                param.requires_grad = False
        # Assign in both cases: previously an unfrozen embedder was never
        # stored, so get_input failed on the missing attribute.
        self.embedder = embedder

    def _init_noise_aug(self, config):
        """Instantiate the optional noise augmentor (kept in eval mode)."""
        if config is not None:
            # use the KARLO schedule for noise augmentation on CLIP image embeddings
            noise_augmentor = instantiate_from_config(config)
            assert isinstance(noise_augmentor, nn.Module)
            noise_augmentor = noise_augmentor.eval()
            noise_augmentor.train = disabled_train
            self.noise_augmentor = noise_augmentor
        else:
            self.noise_augmentor = None

    def get_input(self, batch, k, cond_key=None, bs=None, **kwargs):
        """Return ``[z, all_conds, *extra]`` with the image embedding added.

        ``all_conds`` is ``{"c_crossattn": [c], "c_adm": c_adm}``; ``extra``
        are any further outputs of ``LatentDiffusion.get_input``.
        ``cond_key`` is accepted but not forwarded.
        """
        outputs = LatentDiffusion.get_input(self, batch, k, bs=bs, **kwargs)
        z, c = outputs[0], outputs[1]
        img = batch[self.embed_key][:bs]
        img = rearrange(img, 'b h w c -> b c h w')
        c_adm = self.embedder(img)
        if self.noise_augmentor is not None:
            c_adm, noise_level_emb = self.noise_augmentor(c_adm)
            # assume this gives embeddings of noise levels
            c_adm = torch.cat((c_adm, noise_level_emb), 1)
        if self.training:
            # Per-sample Bernoulli keep-mask: drops whole embeddings with
            # probability `embedding_dropout`.
            keep_prob = (1. - self.embedding_dropout) * torch.ones(c_adm.shape[0], device=c_adm.device)
            c_adm = torch.bernoulli(keep_prob[:, None]) * c_adm
        all_conds = {"c_crossattn": [c], "c_adm": c_adm}
        noutputs = [z, all_conds]
        noutputs.extend(outputs[2:])
        return noutputs

    @torch.no_grad()
    def log_images(self, batch, N=8, n_row=4, **kwargs):
        """Log inputs, reconstructions, the text conditioning and one set of
        classifier-free-guided DDIM samples.

        Recognised ``kwargs``: ``unconditional_guidance_label`` (default
        ``''``), ``unconditional_guidance_scale`` (default ``5.``),
        ``use_ema_scope`` (default ``True``), ``ddim_steps`` (default ``50``)
        and ``ddim_eta`` (default ``0.``).
        """
        log = dict()
        z, c, x, xrec, xc = self.get_input(batch, self.first_stage_key, bs=N, return_first_stage_outputs=True,
                                           return_original_cond=True)
        log["inputs"] = x
        log["reconstruction"] = xrec
        assert self.model.conditioning_key is not None
        assert self.cond_stage_key in ["caption", "txt"]
        xc = log_txt_as_img((x.shape[2], x.shape[3]), batch[self.cond_stage_key], size=x.shape[2] // 25)
        log["conditioning"] = xc
        uc = self.get_unconditional_conditioning(N, kwargs.get('unconditional_guidance_label', ''))
        unconditional_guidance_scale = kwargs.get('unconditional_guidance_scale', 5.)

        # The unconditional branch keeps the image embedding and only swaps the text.
        uc_ = {"c_crossattn": [uc], "c_adm": c["c_adm"]}
        ema_scope = self.ema_scope if kwargs.get('use_ema_scope', True) else nullcontext
        with ema_scope("Sampling"):
            samples_cfg, _ = self.sample_log(cond=c, batch_size=N, ddim=True,
                                             ddim_steps=kwargs.get('ddim_steps', 50), eta=kwargs.get('ddim_eta', 0.),
                                             unconditional_guidance_scale=unconditional_guidance_scale,
                                             unconditional_conditioning=uc_, )
            x_samples_cfg = self.decode_first_stage(samples_cfg)
            log[f"samplescfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg
        return log
ldm/models/diffusion/dpm_solver/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .sampler import DPMSolverSampler
ldm/models/diffusion/dpm_solver/dpm_solver.py ADDED
@@ -0,0 +1,1163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+ from tqdm import tqdm
5
+
6
+
7
class NoiseScheduleVP:
    def __init__(
            self,
            schedule='discrete',
            betas=None,
            alphas_cumprod=None,
            continuous_beta_0=0.1,
            continuous_beta_1=20.,
    ):
        """Wrapper around the forward SDE (VP type).

        The forward process has q(x_t | x_0) = N(alpha_t * x_0, sigma_t^2 * I),
        and lambda_t = log(alpha_t) - log(sigma_t) is the half-logSNR. For a
        time label t this class provides:

            log_alpha_t = self.marginal_log_mean_coeff(t)
            alpha_t     = self.marginal_alpha(t)
            sigma_t     = self.marginal_std(t)
            lambda_t    = self.marginal_lambda(t)
            t           = self.inverse_lambda(lambda_t)

        Discrete-time models (schedule='discrete'):
            A model trained on n = 0, ..., N-1 is mapped to continuous time
            via t_i = (i + 1) / N, so T = 1 and the smallest label is 1 / N.
            log(alpha_t) is interpolated piecewise-linearly between those
            points. Supply exactly one of:
                betas: `torch.Tensor`, the DDPM beta array.
                alphas_cumprod: `torch.Tensor`, the DDPM cumulative products,
                    i.e. cumprod(1 - betas).
            Note the notation: DDPM's cumulative product (alpha_hat_n) is the
            square of alpha_t used here, so
            log(alpha_{t_n}) = 0.5 * log(alpha_hat_n).

        Continuous-time models (schedule='linear' or 'cosine'):
            continuous_beta_0 / continuous_beta_1: the smallest / largest
                beta of the linear schedule.
            The cosine schedule uses fixed hyperparameters (s = 0.008,
            beta_max = 999) and ends at T = 0.9946 to avoid numerical issues
            at T = 1.

        Args:
            schedule: 'discrete', 'linear' or 'cosine'.

        Raises:
            ValueError: if `schedule` is not one of the supported names.

        Example:
            ns = NoiseScheduleVP('discrete', betas=betas)
            ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
            ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
        """
        supported = ('discrete', 'linear', 'cosine')
        if schedule not in supported:
            raise ValueError(
                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
                    schedule))

        self.schedule = schedule
        if schedule == 'discrete':
            if betas is None:
                assert alphas_cumprod is not None
                half_log_cumprod = 0.5 * torch.log(alphas_cumprod)
            else:
                half_log_cumprod = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            self.total_N = len(half_log_cumprod)
            self.T = 1.
            # Interpolation knots: t_i = (i + 1) / N, stored as row vectors.
            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
            self.log_alpha_array = half_log_cumprod.reshape((1, -1,))
            return

        # Continuous-time schedules.
        self.total_N = 1000
        self.beta_0 = continuous_beta_0
        self.beta_1 = continuous_beta_1
        self.cosine_s = 0.008
        self.cosine_beta_max = 999.
        one_plus_s = 1. + self.cosine_s
        self.cosine_t_max = math.atan(self.cosine_beta_max * one_plus_s / math.pi) * 2. * one_plus_s / math.pi - self.cosine_s
        self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / one_plus_s * math.pi / 2.))
        # For the cosine schedule, T = 1 has numerical issues, so the ending
        # time is set manually (0.9946 may not be optimal but works well).
        self.T = 0.9946 if schedule == 'cosine' else 1.

    def marginal_log_mean_coeff(self, t):
        """
        Compute log(alpha_t) of a given continuous-time label t in [0, T].
        """
        kind = self.schedule
        if kind == 'discrete':
            knots_t = self.t_array.to(t.device)
            knots_log_alpha = self.log_alpha_array.to(t.device)
            return interpolate_fn(t.reshape((-1, 1)), knots_t, knots_log_alpha).reshape((-1))
        if kind == 'linear':
            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        if kind == 'cosine':
            angle = (t + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.
            # Normalised by the value at t = 0.
            return torch.log(torch.cos(angle)) - self.cosine_log_alpha_0

    def marginal_alpha(self, t):
        """
        Compute alpha_t of a given continuous-time label t in [0, T].
        """
        log_alpha_t = self.marginal_log_mean_coeff(t)
        return torch.exp(log_alpha_t)

    def marginal_std(self, t):
        """
        Compute sigma_t of a given continuous-time label t in [0, T].
        """
        # sigma_t^2 = 1 - alpha_t^2 for the VP process.
        alpha_sq = torch.exp(2. * self.marginal_log_mean_coeff(t))
        return torch.sqrt(1. - alpha_sq)

    def marginal_lambda(self, t):
        """
        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
        """
        log_alpha_t = self.marginal_log_mean_coeff(t)
        log_sigma_t = 0.5 * torch.log(1. - torch.exp(2. * log_alpha_t))
        return log_alpha_t - log_sigma_t

    def inverse_lambda(self, lamb):
        """
        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
        """
        if self.schedule == 'discrete':
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
            # log_alpha decreases with t, so both knot arrays are flipped
            # before interpolating t as a function of log_alpha.
            flipped_log_alpha = torch.flip(self.log_alpha_array.to(lamb.device), [1])
            flipped_t = torch.flip(self.t_array.to(lamb.device), [1])
            t = interpolate_fn(log_alpha.reshape((-1, 1)), flipped_log_alpha, flipped_t)
            return t.reshape((-1,))
        if self.schedule == 'linear':
            beta_span = self.beta_1 - self.beta_0
            tmp = 2. * beta_span * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0 ** 2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / beta_span
        # Remaining case: cosine schedule.
        log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
        t = torch.arccos(torch.exp(log_alpha + self.cosine_log_alpha_0)) * 2. * (
                1. + self.cosine_s) / math.pi - self.cosine_s
        return t
159
+
160
+
161
def model_wrapper(
        model,
        noise_schedule,
        model_type="noise",
        model_kwargs=None,
        guidance_type="uncond",
        condition=None,
        unconditional_condition=None,
        guidance_scale=1.,
        classifier_fn=None,
        classifier_kwargs=None,
):
    """Create a wrapper function for the noise prediction model.

    DPM-Solver solves continuous-time diffusion ODEs, so the model is wrapped
    into a function ``model_fn(x, t_continuous) -> noise`` that takes the
    continuous time as input and always returns a noise prediction.

    Supported parameterizations (`model_type`):
        1. "noise": the model predicts the noise.
        2. "x_start": the model predicts the data x_0.
        3. "v": the model predicts the velocity (Appendix D of [1], used in [2]).
        4. "score": the model predicts the marginal score, related to the
           noise by ``noise(x_t, t) = -sigma_t * score(x_t, t)``.

    Supported guidance (`guidance_type`):
        1. "uncond": ``model(x, t_input, **model_kwargs)``.
        2. "classifier": classifier guidance [3]. The model is called as for
           "uncond" and the classifier as
           ``classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits``.
        3. "classifier-free": classifier-free guidance [4]. The model is
           called as ``model(x, t_input, cond, **model_kwargs)``; passing
           `unconditional_condition` as `cond` must give the unconditional
           output. Conditions may be tensors, or dicts whose values are
           tensors or lists of tensors.

    `t_input` is the model's own time label: for a 'discrete' schedule it is
    ``(t_continuous - 1 / N) * 1000``, otherwise `t_continuous` itself.

    [1] Salimans and Ho, "Progressive distillation for fast sampling of
        diffusion models", arXiv:2202.00512.
    [2] Ho et al., "Imagen Video: High Definition Video Generation with
        Diffusion Models", arXiv:2210.02303.
    [3] Dhariwal and Nichol, "Diffusion models beat GANs on image synthesis",
        NeurIPS 2021.
    [4] Ho and Salimans, "Classifier-free diffusion guidance", arXiv:2207.12598.

    Args:
        model: A diffusion model with the call format described above.
        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
        model_type: A `str`. "noise", "x_start", "v" or "score".
        model_kwargs: A `dict` of extra keyword inputs for `model`
            (``None`` means no extra inputs).
        guidance_type: A `str`. "uncond", "classifier" or "classifier-free".
        condition: The condition for guided sampling. Only used for
            "classifier" or "classifier-free".
        unconditional_condition: The condition for unconditional sampling.
            Only used for "classifier-free".
        guidance_scale: A `float`. The scale for the guided sampling.
        classifier_fn: A classifier function. Only used for "classifier".
        classifier_kwargs: A `dict` of extra keyword inputs for
            `classifier_fn` (``None`` means no extra inputs).
    Returns:
        A noise prediction function accepting the noised data and the
        continuous time.
    """
    # "score" is handled by noise_pred_fn below, so it must be accepted here too.
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]
    # None instead of `{}` defaults avoids sharing one dict across calls.
    model_kwargs = {} if model_kwargs is None else model_kwargs
    classifier_kwargs = {} if classifier_kwargs is None else classifier_kwargs

    def get_model_input_time(t_continuous):
        """
        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
        For continuous-time DPMs, we just use `t_continuous`.
        """
        if noise_schedule.schedule == 'discrete':
            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
        return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """Evaluate `model` and convert its output to a noise prediction."""
        if t_continuous.reshape((-1,)).shape[0] == 1:
            # A single time value is shared by the whole batch.
            t_continuous = t_continuous.expand((x.shape[0]))
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return -expand_dims(sigma_t, dims) * output

    def cond_grad_fn(x, t_input):
        """
        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
        """
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """
        The noise prediction model function that is used for DPM-Solver.
        """
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1. or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            # Run unconditional and conditional branches as one doubled batch.
            x_in = torch.cat([x] * 2)
            t_in = torch.cat([t_continuous] * 2)
            if isinstance(condition, dict):
                assert isinstance(unconditional_condition, dict)
                c_in = dict()
                for k in condition:
                    if isinstance(condition[k], list):
                        c_in[k] = [torch.cat([uc_item, c_item])
                                   for uc_item, c_item in zip(unconditional_condition[k], condition[k])]
                    else:
                        c_in[k] = torch.cat([unconditional_condition[k], condition[k]])
            else:
                c_in = torch.cat([unconditional_condition, condition])
            noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
            return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
326
+
327
+
328
+ class DPM_Solver:
329
+ def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.):
330
+ """Construct a DPM-Solver.
331
+ We support both the noise prediction model ("predicting epsilon") and the data prediction model ("predicting x0").
332
+ If `predict_x0` is False, we use the solver for the noise prediction model (DPM-Solver).
333
+ If `predict_x0` is True, we use the solver for the data prediction model (DPM-Solver++).
334
+ In such case, we further support the "dynamic thresholding" in [1] when `thresholding` is True.
335
+ The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales.
336
+ Args:
337
+ model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]):
338
+ ``
339
+ def model_fn(x, t_continuous):
340
+ return noise
341
+ ``
342
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP.
343
+ predict_x0: A `bool`. If true, use the data prediction model; else, use the noise prediction model.
344
+ thresholding: A `bool`. Valid when `predict_x0` is True. Whether to use the "dynamic thresholding" in [1].
345
+ max_val: A `float`. Valid when both `predict_x0` and `thresholding` are True. The max value for thresholding.
346
+
347
+ [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b.
348
+ """
349
+ self.model = model_fn
350
+ self.noise_schedule = noise_schedule
351
+ self.predict_x0 = predict_x0
352
+ self.thresholding = thresholding
353
+ self.max_val = max_val
354
+
355
+ def noise_prediction_fn(self, x, t):
356
+ """
357
+ Return the noise prediction model.
358
+ """
359
+ return self.model(x, t)
360
+
361
def data_prediction_fn(self, x, t):
    """
    Predict x0 from the noise prediction, optionally with dynamic thresholding.
    """
    ndim = x.dim()
    eps = self.noise_prediction_fn(x, t)
    schedule = self.noise_schedule
    alpha_t, sigma_t = schedule.marginal_alpha(t), schedule.marginal_std(t)
    # Invert x_t = alpha_t * x0 + sigma_t * eps.
    x0 = (x - expand_dims(sigma_t, ndim) * eps) / expand_dims(alpha_t, ndim)
    if not self.thresholding:
        return x0
    p = 0.995  # A hyperparameter in the paper of "Imagen" [1].
    flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
    s = torch.quantile(flat_abs, p, dim=1)
    # The per-sample threshold is never smaller than max_val.
    floor = self.max_val * torch.ones_like(s).to(s.device)
    s = expand_dims(torch.maximum(s, floor), ndim)
    return torch.clamp(x0, -s, s) / s
375
+
376
+ def model_fn(self, x, t):
377
+ """
378
+ Convert the model to the noise prediction model or the data prediction model.
379
+ """
380
+ if self.predict_x0:
381
+ return self.data_prediction_fn(x, t)
382
+ else:
383
+ return self.noise_prediction_fn(x, t)
384
+
385
+ def get_time_steps(self, skip_type, t_T, t_0, N, device):
386
+ """Compute the intermediate time steps for sampling.
387
+ Args:
388
+ skip_type: A `str`. The type for the spacing of the time steps. We support three types:
389
+ - 'logSNR': uniform logSNR for the time steps.
390
+ - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
391
+ - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
392
+ t_T: A `float`. The starting time of the sampling (default is T).
393
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
394
+ N: A `int`. The total number of the spacing of the time steps.
395
+ device: A torch device.
396
+ Returns:
397
+ A pytorch tensor of the time steps, with the shape (N + 1,).
398
+ """
399
+ if skip_type == 'logSNR':
400
+ lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
401
+ lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
402
+ logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
403
+ return self.noise_schedule.inverse_lambda(logSNR_steps)
404
+ elif skip_type == 'time_uniform':
405
+ return torch.linspace(t_T, t_0, N + 1).to(device)
406
+ elif skip_type == 'time_quadratic':
407
+ t_order = 2
408
+ t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device)
409
+ return t
410
+ else:
411
+ raise ValueError(
412
+ "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
413
+
414
def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
    """
    Get the order of each step, and the outer time grid, for sampling by the singlestep DPM-Solver.
    We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast".
    Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is:
        - If order == 1:
            We take `steps` of DPM-Solver-1 (i.e. DDIM).
        - If order == 2:
            - Denote K = (steps // 2) + (steps % 2). We take K outer steps for sampling.
            - If steps % 2 == 0, we use K steps of DPM-Solver-2.
            - If steps % 2 == 1, we use (K - 1) steps of DPM-Solver-2 and 1 step of DPM-Solver-1.
        - If order == 3:
            - Denote K = (steps // 3 + 1). We take K outer steps for sampling.
            - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
            - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
            - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
    ============================================
    Args:
        steps: A `int`. The total number of function evaluations (NFE).
        order: A `int`. The max order for the solver (1, 2 or 3).
        skip_type: A `str`. The type for the spacing of the time steps. We support three types:
            - 'logSNR': uniform logSNR for the time steps.
            - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
            - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
        t_T: A `float`. The starting time of the sampling (default is T).
        t_0: A `float`. The ending time of the sampling (default is epsilon).
        device: A torch device.
    Returns:
        timesteps_outer: A pytorch tensor with one more entry than `orders`; entries `i` and `i + 1` are the
            start and end times of the i-th solver step.
        orders: A list of the solver order of each step.
    Raises:
        ValueError: If `order` is not 1, 2 or 3.
    """
    if order == 3:
        K = steps // 3 + 1
        if steps % 3 == 0:
            orders = [3, ] * (K - 2) + [2, 1]
        elif steps % 3 == 1:
            orders = [3, ] * (K - 1) + [1]
        else:
            orders = [3, ] * (K - 1) + [2]
    elif order == 2:
        if steps % 2 == 0:
            K = steps // 2
            orders = [2, ] * K
        else:
            K = steps // 2 + 1
            orders = [2, ] * (K - 1) + [1]
    elif order == 1:
        # One first-order step per function evaluation, so the outer grid needs `steps` intervals
        # (K must equal len(orders), as it does for the other orders).
        K = steps
        orders = [1, ] * steps
    else:
        raise ValueError("'order' must be '1' or '2' or '3'.")
    if skip_type == 'logSNR':
        # To reproduce the results in DPM-Solver paper
        timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device)
    else:
        # Build the fine grid with one point per function evaluation, then keep only the points where
        # a solver step starts/ends. torch.cumsum requires an explicit `dim`.
        step_boundaries = torch.cumsum(torch.tensor([0, ] + orders), dim=0).to(device)
        timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[step_boundaries]
    return timesteps_outer, orders
471
+
472
def denoise_to_zero_fn(self, x, s):
    """
    Final denoising step: return the data prediction for `x` at time `s`.

    This is equivalent to solving the ODE from lambda_s to infinity by a first-order discretization.
    Args:
        x: A pytorch tensor. The current value at time `s`.
        s: A pytorch tensor. The current time.
    Returns:
        Whatever `self.data_prediction_fn(x, s)` returns.
    """
    x_0 = self.data_prediction_fn(x, s)
    return x_0
477
+
478
def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False):
    """
    Take one DPM-Solver-1 step (equivalent to DDIM) from time `s` to time `t`.

    Both parameterisations share the form `x_t = x_coeff * x - model_coeff * model_s`; only the two
    coefficients depend on `self.predict_x0`.
    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        model_s: A pytorch tensor. The model function evaluated at time `s`.
            If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
        return_intermediate: A `bool`. If true, also return the model value at time `s`.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
            If `return_intermediate` is true, the tuple `(x_t, {'model_s': model_s})` is returned instead.
    """
    schedule = self.noise_schedule
    n_dims = x.dim()
    lambda_start = schedule.marginal_lambda(s)
    lambda_end = schedule.marginal_lambda(t)
    h = lambda_end - lambda_start
    log_alpha_start = schedule.marginal_log_mean_coeff(s)
    log_alpha_end = schedule.marginal_log_mean_coeff(t)
    sigma_start = schedule.marginal_std(s)
    sigma_end = schedule.marginal_std(t)
    alpha_end = torch.exp(log_alpha_end)

    if self.predict_x0:
        # Data-prediction form: scale x by the std ratio, weight the model by alpha_t * (e^{-h} - 1).
        x_coeff = sigma_end / sigma_start
        model_coeff = alpha_end * torch.expm1(-h)
    else:
        # Noise-prediction form: scale x by the mean-coefficient ratio, weight the model by sigma_t * (e^{h} - 1).
        x_coeff = torch.exp(log_alpha_end - log_alpha_start)
        model_coeff = sigma_end * torch.expm1(h)

    if model_s is None:
        model_s = self.model_fn(x, s)
    x_t = (
        expand_dims(x_coeff, n_dims) * x
        - expand_dims(model_coeff, n_dims) * model_s
    )
    if return_intermediate:
        return x_t, {'model_s': model_s}
    return x_t
523
+
524
def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False,
                                        solver_type='dpm_solver'):
    """
    Take one singlestep DPM-Solver-2 step from time `s` to time `t`.

    The model is evaluated at `s` (unless `model_s` is given) and at an intermediate time `s1`, chosen so
    that `marginal_lambda(s1) = lambda_s + r1 * (lambda_t - lambda_s)`.
    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        r1: A `float`. The hyperparameter of the second-order solver. `None` is treated as 0.5.
        model_s: A pytorch tensor. The model function evaluated at time `s`.
            If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
        return_intermediate: A `bool`. If true, also return the model value at time `s` and `s1` (the intermediate time).
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
            If `return_intermediate` is true, `(x_t, {'model_s': ..., 'model_s1': ...})` is returned instead.
    Raises:
        ValueError: If `solver_type` is neither 'dpm_solver' nor 'taylor'.
    """
    if solver_type not in ('dpm_solver', 'taylor'):
        raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
    r1 = 0.5 if r1 is None else r1

    schedule = self.noise_schedule
    n_dims = x.dim()
    lambda_s = schedule.marginal_lambda(s)
    lambda_t = schedule.marginal_lambda(t)
    h = lambda_t - lambda_s
    s1 = schedule.inverse_lambda(lambda_s + r1 * h)
    log_alpha_s = schedule.marginal_log_mean_coeff(s)
    log_alpha_s1 = schedule.marginal_log_mean_coeff(s1)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_s = schedule.marginal_std(s)
    sigma_s1 = schedule.marginal_std(s1)
    sigma_t = schedule.marginal_std(t)
    alpha_s1 = torch.exp(log_alpha_s1)
    alpha_t = torch.exp(log_alpha_t)

    if self.predict_x0:
        phi_11 = torch.expm1(-r1 * h)
        phi_1 = torch.expm1(-h)

        if model_s is None:
            model_s = self.model_fn(x, s)
        # First-order step to the intermediate time, then a second model evaluation there.
        x_s1 = (
            expand_dims(sigma_s1 / sigma_s, n_dims) * x
            - expand_dims(alpha_s1 * phi_11, n_dims) * model_s
        )
        model_s1 = self.model_fn(x_s1, s1)

        model_weight = expand_dims(alpha_t * phi_1, n_dims)
        first_order = expand_dims(sigma_t / sigma_s, n_dims) * x - model_weight * model_s
        model_diff = model_s1 - model_s
        if solver_type == 'dpm_solver':
            x_t = first_order - (0.5 / r1) * model_weight * model_diff
        else:
            # solver_type == 'taylor' (validated above)
            taylor_weight = expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), n_dims)
            x_t = first_order + (1. / r1) * taylor_weight * model_diff
    else:
        phi_11 = torch.expm1(r1 * h)
        phi_1 = torch.expm1(h)

        if model_s is None:
            model_s = self.model_fn(x, s)
        # First-order step to the intermediate time, then a second model evaluation there.
        x_s1 = (
            expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), n_dims) * x
            - expand_dims(sigma_s1 * phi_11, n_dims) * model_s
        )
        model_s1 = self.model_fn(x_s1, s1)

        model_weight = expand_dims(sigma_t * phi_1, n_dims)
        first_order = expand_dims(torch.exp(log_alpha_t - log_alpha_s), n_dims) * x - model_weight * model_s
        model_diff = model_s1 - model_s
        if solver_type == 'dpm_solver':
            x_t = first_order - (0.5 / r1) * model_weight * model_diff
        else:
            # solver_type == 'taylor' (validated above)
            taylor_weight = expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), n_dims)
            x_t = first_order - (1. / r1) * taylor_weight * model_diff

    if return_intermediate:
        return x_t, {'model_s': model_s, 'model_s1': model_s1}
    return x_t
607
+
608
def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None,
                                       return_intermediate=False, solver_type='dpm_solver'):
    """
    Take one singlestep DPM-Solver-3 step from time `s` to time `t`.

    The model is evaluated at `s`, and at two intermediate times `s1` and `s2` whose lambda values are
    `lambda_s + r1 * h` and `lambda_s + r2 * h`, with `h = lambda_t - lambda_s`. Evaluations supplied through
    `model_s` / `model_s1` are reused instead of being recomputed.
    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        r1: A `float`. The hyperparameter of the third-order solver. `None` is treated as 1/3.
        r2: A `float`. The hyperparameter of the third-order solver. `None` is treated as 2/3.
        model_s: A pytorch tensor. The model function evaluated at time `s`.
            If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it.
        model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`).
            If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it.
        return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
            If `return_intermediate` is true, `(x_t, {'model_s': ..., 'model_s1': ..., 'model_s2': ...})`
            is returned instead.
    Raises:
        ValueError: If `solver_type` is neither 'dpm_solver' nor 'taylor'.
    """
    if solver_type not in ('dpm_solver', 'taylor'):
        raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
    r1 = 1. / 3. if r1 is None else r1
    r2 = 2. / 3. if r2 is None else r2

    schedule = self.noise_schedule
    n_dims = x.dim()
    lambda_s = schedule.marginal_lambda(s)
    lambda_t = schedule.marginal_lambda(t)
    h = lambda_t - lambda_s
    lambda_s1 = lambda_s + r1 * h
    lambda_s2 = lambda_s + r2 * h
    s1 = schedule.inverse_lambda(lambda_s1)
    s2 = schedule.inverse_lambda(lambda_s2)
    log_alpha_s = schedule.marginal_log_mean_coeff(s)
    log_alpha_s1 = schedule.marginal_log_mean_coeff(s1)
    log_alpha_s2 = schedule.marginal_log_mean_coeff(s2)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_s = schedule.marginal_std(s)
    sigma_s1 = schedule.marginal_std(s1)
    sigma_s2 = schedule.marginal_std(s2)
    sigma_t = schedule.marginal_std(t)
    alpha_s1 = torch.exp(log_alpha_s1)
    alpha_s2 = torch.exp(log_alpha_s2)
    alpha_t = torch.exp(log_alpha_t)

    if self.predict_x0:
        phi_11 = torch.expm1(-r1 * h)
        phi_12 = torch.expm1(-r2 * h)
        phi_1 = torch.expm1(-h)
        phi_22 = phi_12 / (r2 * h) + 1.
        phi_2 = phi_1 / h + 1.
        phi_3 = phi_2 / h - 0.5

        if model_s is None:
            model_s = self.model_fn(x, s)
        if model_s1 is None:
            # Only step to s1 when the caller did not already supply the model value there.
            x_s1 = (
                expand_dims(sigma_s1 / sigma_s, n_dims) * x
                - expand_dims(alpha_s1 * phi_11, n_dims) * model_s
            )
            model_s1 = self.model_fn(x_s1, s1)
        x_s2 = (
            expand_dims(sigma_s2 / sigma_s, n_dims) * x
            - expand_dims(alpha_s2 * phi_12, n_dims) * model_s
            + r2 / r1 * expand_dims(alpha_s2 * phi_22, n_dims) * (model_s1 - model_s)
        )
        model_s2 = self.model_fn(x_s2, s2)

        first_order = (
            expand_dims(sigma_t / sigma_s, n_dims) * x
            - expand_dims(alpha_t * phi_1, n_dims) * model_s
        )
        if solver_type == 'dpm_solver':
            x_t = first_order + (1. / r2) * expand_dims(alpha_t * phi_2, n_dims) * (model_s2 - model_s)
        else:
            # solver_type == 'taylor' (validated above): finite-difference estimates D1 and D2
            # built from the two intermediate evaluations.
            D1_0 = (1. / r1) * (model_s1 - model_s)
            D1_1 = (1. / r2) * (model_s2 - model_s)
            D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
            D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
            x_t = (
                first_order
                + expand_dims(alpha_t * phi_2, n_dims) * D1
                - expand_dims(alpha_t * phi_3, n_dims) * D2
            )
    else:
        phi_11 = torch.expm1(r1 * h)
        phi_12 = torch.expm1(r2 * h)
        phi_1 = torch.expm1(h)
        phi_22 = phi_12 / (r2 * h) - 1.
        phi_2 = phi_1 / h - 1.
        phi_3 = phi_2 / h - 0.5

        if model_s is None:
            model_s = self.model_fn(x, s)
        if model_s1 is None:
            # Only step to s1 when the caller did not already supply the model value there.
            x_s1 = (
                expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), n_dims) * x
                - expand_dims(sigma_s1 * phi_11, n_dims) * model_s
            )
            model_s1 = self.model_fn(x_s1, s1)
        x_s2 = (
            expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), n_dims) * x
            - expand_dims(sigma_s2 * phi_12, n_dims) * model_s
            - r2 / r1 * expand_dims(sigma_s2 * phi_22, n_dims) * (model_s1 - model_s)
        )
        model_s2 = self.model_fn(x_s2, s2)

        first_order = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_s), n_dims) * x
            - expand_dims(sigma_t * phi_1, n_dims) * model_s
        )
        if solver_type == 'dpm_solver':
            x_t = first_order - (1. / r2) * expand_dims(sigma_t * phi_2, n_dims) * (model_s2 - model_s)
        else:
            # solver_type == 'taylor' (validated above): finite-difference estimates D1 and D2
            # built from the two intermediate evaluations.
            D1_0 = (1. / r1) * (model_s1 - model_s)
            D1_1 = (1. / r2) * (model_s2 - model_s)
            D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
            D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
            x_t = (
                first_order
                - expand_dims(sigma_t * phi_2, n_dims) * D1
                - expand_dims(sigma_t * phi_3, n_dims) * D2
            )

    if return_intermediate:
        return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2}
    return x_t
731
+
732
def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpm_solver"):
    """
    Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`.

    Only the two most recent entries of `model_prev_list` and `t_prev_list` are used, so the lists may hold
    a longer history (e.g. when a third-order run drops to second order for one of its final steps).
    Args:
        x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
        model_prev_list: A list of pytorch tensor. The previous computed model values (at least two, oldest first).
        t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    Raises:
        ValueError: If `solver_type` is neither 'dpm_solver' nor 'taylor'.
    """
    if solver_type not in ['dpm_solver', 'taylor']:
        raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type))
    ns = self.noise_schedule
    dims = x.dim()
    # Index from the end instead of unpacking: tuple-unpacking raised "too many values to unpack"
    # whenever the caller's history held more than two entries.
    model_prev_1, model_prev_0 = model_prev_list[-2], model_prev_list[-1]
    t_prev_1, t_prev_0 = t_prev_list[-2], t_prev_list[-1]
    lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(
        t_prev_0), ns.marginal_lambda(t)
    log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    alpha_t = torch.exp(log_alpha_t)

    h_0 = lambda_prev_0 - lambda_prev_1  # size (in lambda) of the previous step
    h = lambda_t - lambda_prev_0  # size (in lambda) of the step being taken
    r0 = h_0 / h
    # Finite difference of the two stored model values, rescaled to the current step size.
    D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1)
    if self.predict_x0:
        if solver_type == 'dpm_solver':
            x_t = (
                expand_dims(sigma_t / sigma_prev_0, dims) * x
                - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
                - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0
            )
        elif solver_type == 'taylor':
            x_t = (
                expand_dims(sigma_t / sigma_prev_0, dims) * x
                - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0
                + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0
            )
    else:
        if solver_type == 'dpm_solver':
            x_t = (
                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
                - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
                - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0
            )
        elif solver_type == 'taylor':
            x_t = (
                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
                - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0
                - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0
            )
    return x_t
788
+
789
def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpm_solver'):
    """
    Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`.

    Args:
        x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
        model_prev_list: A list of exactly three pytorch tensors. The previous computed model values, oldest first.
        t_prev_list: A list of exactly three pytorch tensors. The previous times, each time has the shape (x.shape[0],)
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        solver_type: either 'dpm_solver' or 'taylor'. Accepted for interface symmetry with the other
            high-order updates; this method's body does not read it.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    """
    schedule = self.noise_schedule
    n_dims = x.dim()
    model_oldest, model_middle, model_latest = model_prev_list
    t_oldest, t_middle, t_latest = t_prev_list
    lambda_oldest = schedule.marginal_lambda(t_oldest)
    lambda_middle = schedule.marginal_lambda(t_middle)
    lambda_latest = schedule.marginal_lambda(t_latest)
    lambda_t = schedule.marginal_lambda(t)
    log_alpha_latest = schedule.marginal_log_mean_coeff(t_latest)
    log_alpha_t = schedule.marginal_log_mean_coeff(t)
    sigma_latest = schedule.marginal_std(t_latest)
    sigma_t = schedule.marginal_std(t)
    alpha_t = torch.exp(log_alpha_t)

    # Step sizes in lambda: two historical steps and the one being taken now.
    h_1 = lambda_middle - lambda_oldest
    h_0 = lambda_latest - lambda_middle
    h = lambda_t - lambda_latest
    r0 = h_0 / h
    r1 = h_1 / h
    # Finite differences of the stored model values, rescaled to the current step size.
    D1_0 = expand_dims(1. / r0, n_dims) * (model_latest - model_middle)
    D1_1 = expand_dims(1. / r1, n_dims) * (model_middle - model_oldest)
    D1_gap = D1_0 - D1_1
    D1 = D1_0 + expand_dims(r0 / (r0 + r1), n_dims) * D1_gap
    D2 = expand_dims(1. / (r0 + r1), n_dims) * D1_gap

    if self.predict_x0:
        exp_term = torch.exp(-h) - 1.
        return (
            expand_dims(sigma_t / sigma_latest, n_dims) * x
            - expand_dims(alpha_t * exp_term, n_dims) * model_latest
            + expand_dims(alpha_t * (exp_term / h + 1.), n_dims) * D1
            - expand_dims(alpha_t * ((exp_term + h) / h ** 2 - 0.5), n_dims) * D2
        )

    exp_term = torch.exp(h) - 1.
    return (
        expand_dims(torch.exp(log_alpha_t - log_alpha_latest), n_dims) * x
        - expand_dims(sigma_t * exp_term, n_dims) * model_latest
        - expand_dims(sigma_t * (exp_term / h - 1.), n_dims) * D1
        - expand_dims(sigma_t * ((exp_term - h) / h ** 2 - 0.5), n_dims) * D2
    )
835
+
836
def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None,
                                 r2=None):
    """
    Dispatch one singlestep DPM-Solver step of the requested `order` from time `s` to time `t`.

    Args:
        x: A pytorch tensor. The initial value at time `s`.
        s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
        return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times).
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
            Not forwarded to the first-order update.
        r1: A `float`. The hyperparameter of the second-order or third-order solver.
        r2: A `float`. The hyperparameter of the third-order solver.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t` (whatever the selected update returns).
    Raises:
        ValueError: If `order` is not 1, 2 or 3.
    """
    if order == 1:
        return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate)

    # Keyword arguments shared by the two higher-order updates.
    high_order_kwargs = {'return_intermediate': return_intermediate, 'solver_type': solver_type}
    if order == 2:
        return self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, **high_order_kwargs)
    if order == 3:
        return self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, **high_order_kwargs)
    raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
863
+
864
def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpm_solver'):
    """
    Dispatch one multistep DPM-Solver step of the requested `order` from time `t_prev_list[-1]` to time `t`.

    Args:
        x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
        model_prev_list: A list of pytorch tensor. The previous computed model values.
        t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],)
        t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
        order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
            Not forwarded to the first-order update.
    Returns:
        x_t: A pytorch tensor. The approximated solution at time `t`.
    Raises:
        ValueError: If `order` is not 1, 2 or 3.
    """
    if order == 1:
        # First order only needs the most recent time and model value.
        latest_t = t_prev_list[-1]
        latest_model = model_prev_list[-1]
        return self.dpm_solver_first_update(x, latest_t, t, model_s=latest_model)
    if order == 2:
        return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
    if order == 3:
        return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type)
    raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
886
+
887
def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5,
                        solver_type='dpm_solver'):
    """
    The adaptive step size solver based on singlestep DPM-Solver.

    Each iteration proposes a step of size `h` in lambda, computes it with a lower-order and a higher-order
    singlestep update (the higher one reuses the lower one's model evaluations), and accepts the
    higher-order result when the scaled difference `E` between the two is at most 1. The step size is
    re-estimated from `E` after every proposal, accepted or not.
    Args:
        x: A pytorch tensor. The initial value at time `t_T`.
        order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
        t_T: A `float`. The starting time of the sampling (default is T).
        t_0: A `float`. The ending time of the sampling (default is epsilon).
        h_init: A `float`. The initial step size (for logSNR).
        atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1].
        rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
        theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1].
        t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
            current time and `t_0` is less than `t_err`. The default setting is 1e-5.
        solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers.
            The type slightly impacts the performance. We recommend to use 'dpm_solver' type.
    Returns:
        x_0: A pytorch tensor. The approximated solution at time `t_0`.
    Raises:
        ValueError: If `order` is not 2 or 3.
    [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
    """
    schedule = self.noise_schedule
    s = t_T * torch.ones((x.shape[0],)).to(x)
    lambda_s = schedule.marginal_lambda(s)
    lambda_0 = schedule.marginal_lambda(t_0 * torch.ones_like(s).to(x))
    h = h_init * torch.ones_like(s).to(x)
    x_prev = x
    nfe = 0

    if order == 2:
        r1 = 0.5

        def lower_update(x_cur, s_cur, t_next):
            return self.dpm_solver_first_update(x_cur, s_cur, t_next, return_intermediate=True)

        def higher_update(x_cur, s_cur, t_next, **kwargs):
            return self.singlestep_dpm_solver_second_update(x_cur, s_cur, t_next, r1=r1,
                                                            solver_type=solver_type, **kwargs)
    elif order == 3:
        r1, r2 = 1. / 3., 2. / 3.

        def lower_update(x_cur, s_cur, t_next):
            return self.singlestep_dpm_solver_second_update(x_cur, s_cur, t_next, r1=r1,
                                                            return_intermediate=True, solver_type=solver_type)

        def higher_update(x_cur, s_cur, t_next, **kwargs):
            return self.singlestep_dpm_solver_third_update(x_cur, s_cur, t_next, r1=r1, r2=r2,
                                                           solver_type=solver_type, **kwargs)
    else:
        raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))

    def rms_per_sample(v):
        # Root-mean-square over everything except the leading (batch) dimension.
        return torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))

    while torch.abs((s - t_0)).mean() > t_err:
        t = schedule.inverse_lambda(lambda_s + h)
        x_lower, reusable_evals = lower_update(x, s, t)
        x_higher = higher_update(x, s, t, **reusable_evals)
        # Mixed absolute/relative tolerance, elementwise.
        tolerance = torch.max(torch.ones_like(x).to(x) * atol,
                              rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
        E = rms_per_sample((x_higher - x_lower) / tolerance).max()
        if torch.all(E <= 1.):
            # Accept the proposal and move the current time forward.
            x, s, x_prev = x_higher, t, x_lower
            lambda_s = schedule.marginal_lambda(s)
        # Rescale the step from the error estimate, never stepping past lambda_0.
        h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
        nfe += order
    print('adaptive solver nfe', nfe)
    return x
947
+
948
+ def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform',
949
+ method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
950
+ atol=0.0078, rtol=0.05,
951
+ ):
952
+ """
953
+ Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`.
954
+ =====================================================
955
+ We support the following algorithms for both noise prediction model and data prediction model:
956
+ - 'singlestep':
957
+ Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver.
958
+ We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps).
959
+ The total number of function evaluations (NFE) == `steps`.
960
+ Given a fixed NFE == `steps`, the sampling procedure is:
961
+ - If `order` == 1:
962
+ - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM).
963
+ - If `order` == 2:
964
+ - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling.
965
+ - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2.
966
+ - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
967
+ - If `order` == 3:
968
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
969
+ - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1.
970
+ - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1.
971
+ - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2.
972
+ - 'multistep':
973
+ Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`.
974
+ We initialize the first `order` values by lower order multistep solvers.
975
+ Given a fixed NFE == `steps`, the sampling procedure is:
976
+ Denote K = steps.
977
+ - If `order` == 1:
978
+ - We use K steps of DPM-Solver-1 (i.e. DDIM).
979
+ - If `order` == 2:
980
+ - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2.
981
+ - If `order` == 3:
982
+ - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3.
983
+ - 'singlestep_fixed':
984
+ Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3).
985
+ We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
986
+ - 'adaptive':
987
+ Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper).
988
+ We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
989
+ You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computatation costs
990
+ (NFE) and the sample quality.
991
+ - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2.
992
+ - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3.
993
+ =====================================================
994
+ Some advices for choosing the algorithm:
995
+ - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs:
996
+ Use singlestep DPM-Solver ("DPM-Solver-fast" in the paper) with `order = 3`.
997
+ e.g.
998
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=False)
999
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3,
1000
+ skip_type='time_uniform', method='singlestep')
1001
+ - For **guided sampling with large guidance scale** by DPMs:
1002
+ Use multistep DPM-Solver with `predict_x0 = True` and `order = 2`.
1003
+ e.g.
1004
+ >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True)
1005
+ >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2,
1006
+ skip_type='time_uniform', method='multistep')
1007
+ We support three types of `skip_type`:
1008
+ - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images**
1009
+ - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**.
1010
+ - 'time_quadratic': quadratic time for the time steps.
1011
+ =====================================================
1012
+ Args:
1013
+ x: A pytorch tensor. The initial value at time `t_start`
1014
+ e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution.
1015
+ steps: A `int`. The total number of function evaluations (NFE).
1016
+ t_start: A `float`. The starting time of the sampling.
1017
+ If `T` is None, we use self.noise_schedule.T (default is 1.0).
1018
+ t_end: A `float`. The ending time of the sampling.
1019
+ If `t_end` is None, we use 1. / self.noise_schedule.total_N.
1020
+ e.g. if total_N == 1000, we have `t_end` == 1e-3.
1021
+ For discrete-time DPMs:
1022
+ - We recommend `t_end` == 1. / self.noise_schedule.total_N.
1023
+ For continuous-time DPMs:
1024
+ - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15.
1025
+ order: A `int`. The order of DPM-Solver.
1026
+ skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'.
1027
+ method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'.
1028
+ denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step.
1029
+ Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1).
1030
+ This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and
1031
+ score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID
1032
+ for diffusion models sampling by diffusion SDEs for low-resolutional images
1033
+ (such as CIFAR-10). However, we observed that such trick does not matter for
1034
+ high-resolutional images. As it needs an additional NFE, we do not recommend
1035
+ it for high-resolutional images.
1036
+ lower_order_final: A `bool`. Whether to use lower order solvers at the final steps.
1037
+ Only valid for `method=multistep` and `steps < 15`. We empirically find that
1038
+ this trick is a key to stabilizing the sampling by DPM-Solver with very few steps
1039
+ (especially for steps <= 10). So we recommend to set it to be `True`.
1040
+ solver_type: A `str`. The taylor expansion type for the solver. `dpm_solver` or `taylor`. We recommend `dpm_solver`.
1041
+ atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1042
+ rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'.
1043
+ Returns:
1044
+ x_end: A pytorch tensor. The approximated solution at time `t_end`.
1045
+ """
1046
+ t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
1047
+ t_T = self.noise_schedule.T if t_start is None else t_start
1048
+ device = x.device
1049
+ if method == 'adaptive':
1050
+ with torch.no_grad():
1051
+ x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol,
1052
+ solver_type=solver_type)
1053
+ elif method == 'multistep':
1054
+ assert steps >= order
1055
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
1056
+ assert timesteps.shape[0] - 1 == steps
1057
+ with torch.no_grad():
1058
+ vec_t = timesteps[0].expand((x.shape[0]))
1059
+ model_prev_list = [self.model_fn(x, vec_t)]
1060
+ t_prev_list = [vec_t]
1061
+ # Init the first `order` values by lower order multistep DPM-Solver.
1062
+ for init_order in tqdm(range(1, order), desc="DPM init order"):
1063
+ vec_t = timesteps[init_order].expand(x.shape[0])
1064
+ x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order,
1065
+ solver_type=solver_type)
1066
+ model_prev_list.append(self.model_fn(x, vec_t))
1067
+ t_prev_list.append(vec_t)
1068
+ # Compute the remaining values by `order`-th order multistep DPM-Solver.
1069
+ for step in tqdm(range(order, steps + 1), desc="DPM multistep"):
1070
+ vec_t = timesteps[step].expand(x.shape[0])
1071
+ if lower_order_final and steps < 15:
1072
+ step_order = min(order, steps + 1 - step)
1073
+ else:
1074
+ step_order = order
1075
+ x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order,
1076
+ solver_type=solver_type)
1077
+ for i in range(order - 1):
1078
+ t_prev_list[i] = t_prev_list[i + 1]
1079
+ model_prev_list[i] = model_prev_list[i + 1]
1080
+ t_prev_list[-1] = vec_t
1081
+ # We do not need to evaluate the final model value.
1082
+ if step < steps:
1083
+ model_prev_list[-1] = self.model_fn(x, vec_t)
1084
+ elif method in ['singlestep', 'singlestep_fixed']:
1085
+ if method == 'singlestep':
1086
+ timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order,
1087
+ skip_type=skip_type,
1088
+ t_T=t_T, t_0=t_0,
1089
+ device=device)
1090
+ elif method == 'singlestep_fixed':
1091
+ K = steps // order
1092
+ orders = [order, ] * K
1093
+ timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device)
1094
+ for i, order in enumerate(orders):
1095
+ t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1]
1096
+ timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(),
1097
+ N=order, device=device)
1098
+ lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner)
1099
+ vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0])
1100
+ h = lambda_inner[-1] - lambda_inner[0]
1101
+ r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h
1102
+ r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h
1103
+ x = self.singlestep_dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2)
1104
+ if denoise_to_zero:
1105
+ x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
1106
+ return x
1107
+
1108
+
1109
+ #############################################################
1110
+ # other utility functions
1111
+ #############################################################
1112
+
1113
def interpolate_fn(x, xp, yp):
    """
    Evaluate a piecewise linear function y = f(x) defined by keypoints (xp, yp).

    The evaluation is built only from differentiable tensor ops (sort / gather /
    arithmetic), so it can be used under autograd. Queries that fall outside the
    range of `xp` are extrapolated along the outermost segment.

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size and C is the
            number of channels (C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    batch, num_keys = x.shape[0], xp.shape[1]
    device = x.device

    # Merge each query with its channel's keypoints and sort; the query was
    # concatenated first, so it carries original index 0 and argmin over the
    # sort permutation gives its rank among the keypoints.
    keypoints = xp.unsqueeze(0).repeat((batch, 1, 1))
    merged = torch.cat([x.unsqueeze(2), keypoints], dim=2)
    merged_sorted, permutation = torch.sort(merged, dim=2)
    rank = torch.argmin(permutation, dim=2)
    left = rank - 1

    below_range = torch.eq(rank, 0)
    above_range = torch.eq(rank, num_keys)
    # Shared by both index computations: clamp to the last segment when the
    # query lies beyond the final keypoint.
    clamped_left = torch.where(above_range, torch.tensor(num_keys - 2, device=device), left)

    # Segment end points, indexed in the merged (K + 1 wide) sorted array.
    lo_idx = torch.where(below_range, torch.tensor(1, device=device), clamped_left)
    # When the left neighbour is used directly, the query itself sits at lo_idx + 1,
    # so the right keypoint is one slot further.
    hi_idx = torch.where(torch.eq(lo_idx, left), lo_idx + 2, lo_idx + 1)
    x_lo = torch.gather(merged_sorted, dim=2, index=lo_idx.unsqueeze(2)).squeeze(2)
    x_hi = torch.gather(merged_sorted, dim=2, index=hi_idx.unsqueeze(2)).squeeze(2)

    # Matching segment end points, indexed in the original (K wide) yp array.
    y_lo_idx = torch.where(below_range, torch.tensor(0, device=device), clamped_left)
    y_table = yp.unsqueeze(0).expand(batch, -1, -1)
    y_lo = torch.gather(y_table, dim=2, index=y_lo_idx.unsqueeze(2)).squeeze(2)
    y_hi = torch.gather(y_table, dim=2, index=(y_lo_idx + 1).unsqueeze(2)).squeeze(2)

    return y_lo + (x - x_lo) * (y_hi - y_lo) / (x_hi - x_lo)
1152
+
1153
+
1154
def expand_dims(v, dims):
    """
    Append trailing singleton axes to `v` until it has `dims` dimensions.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dims`: a `int`, the total number of dimensions of the result.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    trailing_axes = (None,) * (dims - 1)
    index = (Ellipsis,) + trailing_axes
    return v[index]
ldm/models/diffusion/dpm_solver/sampler.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+ import torch
3
+
4
+ from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver
5
+
6
# Maps the wrapped model's `parameterization` attribute to the `model_type`
# string handed to `model_wrapper`.
MODEL_TYPES = dict(
    eps="noise",
    v="v",
)
10
+
11
+
12
class DPMSolverSampler(object):
    """Sampler that runs DPM-Solver on top of a latent diffusion model.

    The model's `alphas_cumprod` is copied (as float32) onto `device` at
    construction time and later used to build the discrete noise schedule.
    """

    def __init__(self, model, device=torch.device("cuda"), **kwargs):
        super().__init__()
        self.model = model
        self.device = device
        alphas_cumprod = model.alphas_cumprod.clone().detach().to(torch.float32).to(model.device)
        self.register_buffer('alphas_cumprod', alphas_cumprod)

    def register_buffer(self, name, attr):
        """Store `attr` on the sampler under `name`, moving plain tensors to `self.device`."""
        needs_move = type(attr) is torch.Tensor and attr.device != self.device
        if needs_move:
            attr = attr.to(self.device)
        setattr(self, name, attr)

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               **kwargs
               ):
        """Draw `batch_size` samples of shape `shape` = (C, H, W) using `S` solver steps.

        Only `S`, `batch_size`, `shape`, `conditioning`, `x_T`,
        `unconditional_guidance_scale` and `unconditional_conditioning` are used
        here; the remaining parameters are accepted but ignored by this method.

        Returns:
            A tuple `(samples, None)`; the second slot is always `None`.
        """
        # Sanity check only: a batch-size mismatch is reported, never raised.
        if isinstance(conditioning, dict):
            ctmp = next(iter(conditioning.values()))
            while isinstance(ctmp, list):
                ctmp = ctmp[0]
            if isinstance(ctmp, torch.Tensor):
                cbs = ctmp.shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
        elif isinstance(conditioning, list):
            for ctmp in conditioning:
                if ctmp.shape[0] != batch_size:
                    print(f"Warning: Got {ctmp.shape[0]} conditionings but batch-size is {batch_size}")
        elif isinstance(conditioning, torch.Tensor):
            if conditioning.shape[0] != batch_size:
                print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}')

        device = self.model.betas.device
        # Start from the caller's latent when given, otherwise from Gaussian noise.
        img = torch.randn(size, device=device) if x_T is None else x_T

        noise_schedule = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod)

        def apply_model(x, t, c):
            return self.model.apply_model(x, t, c)

        guided_model = model_wrapper(
            apply_model,
            noise_schedule,
            model_type=MODEL_TYPES[self.model.parameterization],
            guidance_type="classifier-free",
            condition=conditioning,
            unconditional_condition=unconditional_conditioning,
            guidance_scale=unconditional_guidance_scale,
        )

        solver = DPM_Solver(guided_model, noise_schedule, predict_x0=True, thresholding=False)
        samples = solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2,
                                lower_order_final=True)

        return samples.to(device), None
ldm/models/diffusion/plms.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SAMPLING ONLY."""
2
+
3
+ import torch
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from functools import partial
7
+
8
+ from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
9
+ from ldm.models.diffusion.sampling_util import norm_thresholding
10
+
11
+
12
class PLMSSampler(object):
    """Pseudo linear multistep (PLMS) sampler for a latent diffusion model.

    `make_schedule` precomputes the per-step coefficients and stores them on the
    sampler; `sample` is the public entry point, `plms_sampling` runs the loop and
    `p_sample_plms` performs a single update step.
    """

    def __init__(self, model, schedule="linear", device=torch.device("cuda"), **kwargs):
        super().__init__()
        self.model = model
        self.ddpm_num_timesteps = model.num_timesteps
        self.schedule = schedule
        self.device = device

    def register_buffer(self, name, attr):
        """Store `attr` on the sampler under `name`, moving tensors to `self.device`."""
        # isinstance (rather than an exact type comparison) so tensor subclasses
        # are moved to the sampler device as well.
        if isinstance(attr, torch.Tensor) and attr.device != self.device:
            attr = attr.to(self.device)
        setattr(self, name, attr)

    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
        """Precompute and register the schedule for `ddim_num_steps` sampling steps.

        Raises:
            ValueError: if `ddim_eta` is not 0 (PLMS is deterministic).
        """
        if ddim_eta != 0:
            raise ValueError('ddim_eta must be 0 for PLMS')
        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
                                                  num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
        alphas_cumprod = self.model.alphas_cumprod
        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)

        self.register_buffer('betas', to_torch(self.model.betas))
        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))

        # ddim sampling parameters
        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
                                                                                   ddim_timesteps=self.ddim_timesteps,
                                                                                   eta=ddim_eta, verbose=verbose)
        self.register_buffer('ddim_sigmas', ddim_sigmas)
        self.register_buffer('ddim_alphas', ddim_alphas)
        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
                    1 - self.alphas_cumprod / self.alphas_cumprod_prev))
        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)

    @staticmethod
    def _warn_on_batch_mismatch(conditioning, batch_size):
        """Print a warning when the conditioning batch size differs from `batch_size`."""
        if conditioning is None:
            return
        if isinstance(conditioning, dict):
            # Dict values may be (nested) lists of tensors; probe the first leaf.
            ctmp = conditioning[list(conditioning.keys())[0]]
            while isinstance(ctmp, list) and ctmp:
                ctmp = ctmp[0]
            candidates = [ctmp]
        elif isinstance(conditioning, list):
            candidates = conditioning
        else:
            candidates = [conditioning]
        for ctmp in candidates:
            # Anything without a shape cannot be checked; skip it instead of crashing.
            if not hasattr(ctmp, "shape"):
                continue
            cbs = ctmp.shape[0]
            if cbs != batch_size:
                print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")

    @torch.no_grad()
    def sample(self,
               S,
               batch_size,
               shape,
               conditioning=None,
               callback=None,
               normals_sequence=None,
               img_callback=None,
               quantize_x0=False,
               eta=0.,
               mask=None,
               x0=None,
               temperature=1.,
               noise_dropout=0.,
               score_corrector=None,
               corrector_kwargs=None,
               verbose=True,
               x_T=None,
               log_every_t=100,
               unconditional_guidance_scale=1.,
               unconditional_conditioning=None,
               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
               dynamic_threshold=None,
               **kwargs
               ):
        """Draw `batch_size` samples of shape `shape` = (C, H, W) using `S` PLMS steps.

        Returns:
            A tuple `(samples, intermediates)` as produced by `plms_sampling`.
        """
        self._warn_on_batch_mismatch(conditioning, batch_size)

        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        print(f'Data shape for PLMS sampling is {size}')

        samples, intermediates = self.plms_sampling(conditioning, size,
                                                    callback=callback,
                                                    img_callback=img_callback,
                                                    quantize_denoised=quantize_x0,
                                                    mask=mask, x0=x0,
                                                    ddim_use_original_steps=False,
                                                    noise_dropout=noise_dropout,
                                                    temperature=temperature,
                                                    score_corrector=score_corrector,
                                                    corrector_kwargs=corrector_kwargs,
                                                    x_T=x_T,
                                                    log_every_t=log_every_t,
                                                    unconditional_guidance_scale=unconditional_guidance_scale,
                                                    unconditional_conditioning=unconditional_conditioning,
                                                    dynamic_threshold=dynamic_threshold,
                                                    )
        return samples, intermediates

    @torch.no_grad()
    def plms_sampling(self, cond, shape,
                      x_T=None, ddim_use_original_steps=False,
                      callback=None, timesteps=None, quantize_denoised=False,
                      mask=None, x0=None, img_callback=None, log_every_t=100,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None,
                      dynamic_threshold=None):
        """Run the full PLMS loop and return `(final_sample, intermediates)`.

        `intermediates` is a dict with the lists `'x_inter'` and `'pred_x0'`,
        appended to every `log_every_t` indices and at the first iteration.
        """
        device = self.model.betas.device
        b = shape[0]
        if x_T is None:
            img = torch.randn(shape, device=device)
        else:
            img = x_T

        if timesteps is None:
            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
        elif timesteps is not None and not ddim_use_original_steps:
            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
            timesteps = self.ddim_timesteps[:subset_end]

        intermediates = {'x_inter': [img], 'pred_x0': [img]}
        time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps)
        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
        print(f"Running PLMS Sampling with {total_steps} timesteps")

        iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps)
        # History of the most recent model outputs (at most 3 are kept).
        old_eps = []

        for i, step in enumerate(iterator):
            index = total_steps - i - 1
            ts = torch.full((b,), step, device=device, dtype=torch.long)
            # On the last iteration there is no further step; reuse the final one.
            ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long)

            if mask is not None:
                assert x0 is not None
                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
                img = img_orig * mask + (1. - mask) * img

            outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
                                      quantize_denoised=quantize_denoised, temperature=temperature,
                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
                                      corrector_kwargs=corrector_kwargs,
                                      unconditional_guidance_scale=unconditional_guidance_scale,
                                      unconditional_conditioning=unconditional_conditioning,
                                      old_eps=old_eps, t_next=ts_next,
                                      dynamic_threshold=dynamic_threshold)
            img, pred_x0, e_t = outs
            old_eps.append(e_t)
            if len(old_eps) >= 4:
                old_eps.pop(0)
            if callback: callback(i)
            if img_callback: img_callback(pred_x0, i)

            if index % log_every_t == 0 or index == total_steps - 1:
                intermediates['x_inter'].append(img)
                intermediates['pred_x0'].append(pred_x0)

        return img, intermediates

    @torch.no_grad()
    def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
                      unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None,
                      dynamic_threshold=None):
        """Perform one PLMS update and return `(x_prev, pred_x0, e_t)`.

        `old_eps` holds the model outputs of previous steps; its length selects
        the order of the multistep formula used to combine them with `e_t`.
        """
        b, *_, device = *x.shape, x.device

        def get_model_output(x, t):
            if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
                e_t = self.model.apply_model(x, t, c)
            else:
                # Classifier-free guidance: evaluate both branches in one batch.
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t] * 2)
                c_in = torch.cat([unconditional_conditioning, c])
                e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)

            if score_corrector is not None:
                assert self.model.parameterization == "eps"
                e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)

            return e_t

        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
        # `ddim_sigmas_for_original_num_steps` is registered on the sampler by
        # `make_schedule`, so it is read from `self`, not from the model.
        sigmas = self.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas

        def get_x_prev_and_pred_x0(e_t, index):
            # select parameters corresponding to the currently considered timestep
            a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
            a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
            sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
            sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)

            # current prediction for x_0
            pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
            if quantize_denoised:
                pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
            if dynamic_threshold is not None:
                pred_x0 = norm_thresholding(pred_x0, dynamic_threshold)
            # direction pointing to x_t
            dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
            noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
            if noise_dropout > 0.:
                noise = torch.nn.functional.dropout(noise, p=noise_dropout)
            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
            return x_prev, pred_x0

        e_t = get_model_output(x, t)
        if len(old_eps) == 0:
            # Pseudo Improved Euler (2nd order)
            x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index)
            e_t_next = get_model_output(x_prev, t_next)
            e_t_prime = (e_t + e_t_next) / 2
        elif len(old_eps) == 1:
            # 2nd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (3 * e_t - old_eps[-1]) / 2
        elif len(old_eps) == 2:
            # 3rd order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12
        elif len(old_eps) >= 3:
            # 4th order Pseudo Linear Multistep (Adams-Bashforth)
            e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24

        x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index)

        return x_prev, pred_x0, e_t
ldm/models/diffusion/sampling_util.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
def append_dims(x, target_dims):
    """Appends dimensions to the end of a tensor until it has target_dims dimensions.
    From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py

    Raises ValueError when `x` already has more than `target_dims` dimensions."""
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less')
    # Indexing with None adds one singleton axis per entry after the existing ones.
    index = (Ellipsis,) + (None,) * missing
    return x[index]
12
+
13
+
14
def norm_thresholding(x0, value):
    """Rescale each sample of `x0` whose RMS (over all non-batch axes) exceeds `value`.

    Samples with RMS at or below `value` are returned unchanged, because the
    RMS is clamped from below to `value` before dividing.
    """
    rms = x0.pow(2).flatten(1).mean(1).sqrt()
    rms = rms.clamp(min=value)
    # Add trailing singleton axes so the per-sample factor broadcasts over x0.
    rms = rms[(Ellipsis,) + (None,) * (x0.ndim - rms.ndim)]
    return x0 * (value / rms)
17
+
18
+
19
def spatial_norm_thresholding(x0, value):
    """Rescale `x0` wherever the RMS over axis 1 exceeds `value`.

    The RMS is computed along axis 1 with `keepdim=True` and clamped from
    below to `value`, so positions at or below the threshold are unchanged.
    """
    # b c h w
    mean_square = x0.pow(2).mean(1, keepdim=True)
    rms = mean_square.sqrt().clamp(min=value)
    scale = value / rms
    return x0 * scale
ldm/modules/attention.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from inspect import isfunction
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch import nn, einsum
6
+ from einops import rearrange, repeat
7
+ from typing import Optional, Any
8
+
9
+ from ldm.modules.diffusionmodules.util import checkpoint
10
+
11
+
12
# xformers is optional: record whether it could be imported. Any ordinary
# failure disables it, but KeyboardInterrupt / SystemExit are no longer
# swallowed as they were by the previous bare `except:`.
try:
    import xformers
    import xformers.ops
    XFORMERS_IS_AVAILBLE = True
except Exception:
    XFORMERS_IS_AVAILBLE = False
18
+
19
+ # CrossAttn precision handling
20
+ import os
21
+ _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
22
+
23
def exists(val):
    """Return True unless `val` is None (falsy values such as 0 still count as existing)."""
    is_missing = val is None
    return not is_missing
25
+
26
+
27
def uniq(arr):
    """Return a dict-keys view of the distinct elements of `arr`, in first-seen order."""
    seen = dict.fromkeys(arr, True)
    return seen.keys()
29
+
30
+
31
def default(val, d):
    """Return `val` unless it is None; otherwise return the fallback `d`.

    When `d` is a plain Python function it is called (with no arguments) and
    its result is returned, so expensive defaults can be built lazily.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
35
+
36
+
37
def max_neg_value(t):
    """Return the most negative finite value representable in `t`'s dtype."""
    limits = torch.finfo(t.dtype)
    return -limits.max
39
+
40
+
41
def init_(tensor):
    """Fill `tensor` in place with uniform values in [-1/sqrt(d), 1/sqrt(d)] and return it.

    `d` is the size of the tensor's last dimension.
    """
    last_dim = tensor.shape[-1]
    bound = 1 / math.sqrt(last_dim)
    tensor.uniform_(-bound, bound)
    return tensor
46
+
47
+
48
+ # feedforward
49
class GEGLU(nn.Module):
    """Gated linear unit with a GELU gate.

    A single linear layer produces `2 * dim_out` features, which are split into
    a value half and a gate half along the last axis.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.proj = nn.Linear(dim_in, dim_out * 2)

    def forward(self, x):
        projected = self.proj(x)
        value, gate = projected.chunk(2, dim=-1)
        return value * F.gelu(gate)
57
+
58
+
59
class FeedForward(nn.Module):
    """Position-wise feed-forward network: input projection, dropout, output linear.

    The input projection is either `Linear + GELU` or, when `glu` is set, a
    `GEGLU` layer. The hidden width is `int(dim * mult)`; `dim_out` falls back
    to `dim` when not given.
    """

    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
        super().__init__()
        hidden_dim = int(dim * mult)
        dim_out = default(dim_out, dim)

        if glu:
            input_proj = GEGLU(dim, hidden_dim)
        else:
            input_proj = nn.Sequential(
                nn.Linear(dim, hidden_dim),
                nn.GELU(),
            )

        layers = [
            input_proj,
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim_out),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)
77
+
78
+
79
def zero_module(module):
    """
    Set every parameter of `module` to zero in place and return the same module.
    """
    for param in module.parameters():
        # detach() shares storage with the parameter, so zeroing the detached
        # view zeroes the parameter itself without involving autograd.
        detached = param.detach()
        detached.zero_()
    return module
86
+
87
+
88
def Normalize(in_channels):
    """Build the affine GroupNorm (32 groups, eps=1e-6) used for `in_channels` channels."""
    norm_kwargs = dict(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
    return torch.nn.GroupNorm(**norm_kwargs)
90
+
91
+
92
class SpatialSelfAttention(nn.Module):
    """Single-head self-attention over the spatial positions of a feature map.

    Queries, keys, values and the output projection are 1x1 convolutions that
    keep the channel count; the result is added to the input (residual).
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        def conv1x1():
            return torch.nn.Conv2d(in_channels,
                                   in_channels,
                                   kernel_size=1,
                                   stride=1,
                                   padding=0)

        # Creation order is kept (norm, q, k, v, proj_out) so parameter
        # registration and random initialisation match the original layout.
        self.norm = Normalize(in_channels)
        self.q = conv1x1()
        self.k = conv1x1()
        self.v = conv1x1()
        self.proj_out = conv1x1()

    def forward(self, x):
        normed = self.norm(x)
        query = self.q(normed)
        key = self.k(normed)
        value = self.v(normed)

        # compute attention
        b, c, h, w = query.shape
        query = rearrange(query, 'b c h w -> b (h w) c')
        key = rearrange(key, 'b c h w -> b c (h w)')
        scores = torch.einsum('bij,bjk->bik', query, key)

        scores = scores * (int(c)**(-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # attend to values
        value = rearrange(value, 'b c h w -> b c (h w)')
        weights = rearrange(weights, 'b i j -> b j i')
        attended = torch.einsum('bij,bjk->bik', value, weights)
        attended = rearrange(attended, 'b c (h w) -> b c h w', h=h)
        attended = self.proj_out(attended)

        return x+attended
143
+
144
+
145
class CrossAttention(nn.Module):
    """Multi-head attention of `x` over `context` (self-attention when `context` is None).

    Args:
        query_dim: feature size of the query input and of the output.
        context_dim: feature size of the context; falls back to `query_dim`.
        heads: number of attention heads.
        dim_head: feature size per head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.scale = dim_head ** -0.5
        self.heads = heads

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, query_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x, context=None, mask=None):
        h = self.heads

        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        # Fold the heads into the batch axis.
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))

        # force cast to fp32 to avoid overflowing
        if _ATTN_PRECISION == "fp32":
            # Disable autocast for the backend the tensors actually live on.
            # A hard-coded 'cuda' left CPU autocast (e.g. bf16) active, so the
            # fp32 cast below had no effect for CPU tensors.
            autocast_device = 'cuda' if q.device.type == 'cuda' else 'cpu'
            with torch.autocast(enabled=False, device_type=autocast_device):
                q, k = q.float(), k.float()
                sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
        else:
            sim = einsum('b i d, b j d -> b i j', q, k) * self.scale

        del q, k

        if exists(mask):
            mask = rearrange(mask, 'b ... -> b (...)')
            # Named `fill_value` so the module-level `max_neg_value` helper is
            # not shadowed inside this method.
            fill_value = -torch.finfo(sim.dtype).max
            mask = repeat(mask, 'b j -> (b h) () j', h=h)
            sim.masked_fill_(~mask, fill_value)

        # attention, what we cannot get enough of
        sim = sim.softmax(dim=-1)

        out = einsum('b i j, b j d -> b i d', sim, v)
        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
        return self.to_out(out)
195
+
196
+
197
class MemoryEfficientCrossAttention(nn.Module):
    """Multi-head (cross-)attention computed with `xformers.ops.memory_efficient_attention`.

    Same constructor arguments as the vanilla attention class in this file.
    Attention masks are not supported: passing `mask` raises NotImplementedError.
    """

    # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
        super().__init__()
        print(f"Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using "
              f"{heads} heads.")
        inner_dim = dim_head * heads
        context_dim = default(context_dim, query_dim)

        self.heads = heads
        self.dim_head = dim_head

        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)

        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
        self.attention_op: Optional[Any] = None

    def forward(self, x, context=None, mask=None):
        # Fail fast: previously this was only checked after the (expensive)
        # attention computation had already run.
        if exists(mask):
            raise NotImplementedError

        q = self.to_q(x)
        context = default(context, x)
        k = self.to_k(context)
        v = self.to_v(context)

        b, _, _ = q.shape
        # (b, n, heads * dim_head) -> (b * heads, n, dim_head)
        q, k, v = map(
            lambda t: t.reshape(b, t.shape[1], self.heads, self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b * self.heads, t.shape[1], self.dim_head)
            .contiguous(),
            (q, k, v),
        )

        # actually compute the attention, what we cannot get enough of
        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)

        # (b * heads, n, dim_head) -> (b, n, heads * dim_head)
        out = (
            out.reshape(b, self.heads, out.shape[1], self.dim_head)
            .permute(0, 2, 1, 3)
            .reshape(b, out.shape[1], self.heads * self.dim_head)
        )
        return self.to_out(out)
244
+
245
+
246
class BasicTransformerBlock(nn.Module):
    """Transformer block: attention, cross-attention and feed-forward, each with a
    pre-LayerNorm and a residual connection.

    The attention implementation is picked once at construction time depending
    on whether xformers could be imported.
    """

    ATTENTION_MODES = {
        "softmax": CrossAttention,  # vanilla attention
        "softmax-xformers": MemoryEfficientCrossAttention
    }

    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
                 disable_self_attn=False):
        super().__init__()
        if XFORMERS_IS_AVAILBLE:
            attn_mode = "softmax-xformers"
        else:
            attn_mode = "softmax"
        assert attn_mode in self.ATTENTION_MODES
        attention = self.ATTENTION_MODES[attn_mode]

        self.disable_self_attn = disable_self_attn
        # attn1 is a self-attention unless self-attention is disabled, in which
        # case it attends over the context like attn2.
        attn1_context_dim = context_dim if self.disable_self_attn else None
        self.attn1 = attention(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
                               context_dim=attn1_context_dim)
        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
        # is self-attn if context is none
        self.attn2 = attention(query_dim=dim, context_dim=context_dim,
                               heads=n_heads, dim_head=d_head, dropout=dropout)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.norm3 = nn.LayerNorm(dim)
        self.checkpoint = checkpoint

    def forward(self, x, context=None):
        # Delegates to `_forward` through the imported `checkpoint` helper,
        # passing the `self.checkpoint` flag.
        inputs = (x, context)
        return checkpoint(self._forward, inputs, self.parameters(), self.checkpoint)

    def _forward(self, x, context=None):
        attn1_context = context if self.disable_self_attn else None
        x = self.attn1(self.norm1(x), context=attn1_context) + x
        x = self.attn2(self.norm2(x), context=context) + x
        x = self.ff(self.norm3(x)) + x
        return x
276
+
277
+
278
class SpatialTransformer(nn.Module):
    """
    Transformer block for image-like data.
    First, project the input (aka embedding)
    and reshape to b, t, d.
    Then apply standard transformer action.
    Finally, reshape to image
    NEW: use_linear for more efficiency instead of the 1x1 convs

    `context_dim` may be a single value (or None), which is then used for every
    one of the `depth` blocks, or a list with one entry per block. Likewise,
    `forward` accepts either one context shared by all blocks or a list with
    one context per block.
    """
    def __init__(self, in_channels, n_heads, d_head,
                 depth=1, dropout=0., context_dim=None,
                 disable_self_attn=False, use_linear=False,
                 use_checkpoint=True):
        super().__init__()
        # Replicate a scalar / missing context_dim for every block. Previously
        # None raised a TypeError and a scalar with depth > 1 an IndexError.
        if not isinstance(context_dim, list):
            context_dim = [context_dim] * depth
        self.in_channels = in_channels
        inner_dim = n_heads * d_head
        self.norm = Normalize(in_channels)
        if not use_linear:
            self.proj_in = nn.Conv2d(in_channels,
                                     inner_dim,
                                     kernel_size=1,
                                     stride=1,
                                     padding=0)
        else:
            self.proj_in = nn.Linear(in_channels, inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
                                   disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
                for d in range(depth)]
        )
        if not use_linear:
            self.proj_out = zero_module(nn.Conv2d(inner_dim,
                                                  in_channels,
                                                  kernel_size=1,
                                                  stride=1,
                                                  padding=0))
        else:
            # Maps inner_dim back to in_channels. The arguments used to be
            # swapped, which only worked when both sizes were equal (the
            # weight shape is identical in that case).
            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
        self.use_linear = use_linear

    def forward(self, x, context=None):
        # note: if no context is given, cross-attention defaults to self-attention
        if not isinstance(context, list):
            # Share the single context across all blocks; a one-element list
            # raised IndexError for depth > 1.
            context = [context] * len(self.transformer_blocks)
        b, c, h, w = x.shape
        x_in = x
        x = self.norm(x)
        if not self.use_linear:
            x = self.proj_in(x)
        x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
        if self.use_linear:
            x = self.proj_in(x)
        for i, block in enumerate(self.transformer_blocks):
            x = block(x, context=context[i])
        if self.use_linear:
            x = self.proj_out(x)
        x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
        if not self.use_linear:
            x = self.proj_out(x)
        return x + x_in
341
+
ldm/modules/diffusionmodules/__init__.py ADDED
File without changes
ldm/modules/diffusionmodules/model.py ADDED
@@ -0,0 +1,852 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pytorch_diffusion + derived encoder decoder
2
+ import math
3
+ import torch
4
+ import torch.nn as nn
5
+ import numpy as np
6
+ from einops import rearrange
7
+ from typing import Optional, Any
8
+
9
+ from ldm.modules.attention import MemoryEfficientCrossAttention
10
+
11
# Probe for the optional xformers dependency. The flag name keeps its
# historical spelling because other code in this file reads it.
try:
    import xformers
    import xformers.ops
    XFORMERS_IS_AVAILBLE = True
except Exception:
    # Any import-time failure means "run without xformers". Unlike a bare
    # ``except:``, KeyboardInterrupt and SystemExit still propagate.
    XFORMERS_IS_AVAILBLE = False
    print("No module 'xformers'. Proceeding without it.")
18
+
19
+
20
def get_timestep_embedding(timesteps, embedding_dim):
    """
    Build sinusoidal timestep embeddings.

    The original note says this matches the implementation in Denoising
    Diffusion Probabilistic Models (from Fairseq) and in tensor2tensor, but
    differs slightly from the description in Section 3.5 of
    "Attention Is All You Need".

    :param timesteps: 1-D tensor of timesteps (asserted).
    :param embedding_dim: width of the returned embedding.
    :return: float tensor of shape ``(len(timesteps), embedding_dim)`` laid
        out as ``[sin | cos]``, with one trailing zero column when
        ``embedding_dim`` is odd.
    """
    assert len(timesteps.shape) == 1

    half_dim = embedding_dim // 2
    # Clamp the denominator: with half_dim == 1 (embedding_dim 2 or 3) the
    # unclamped ``half_dim - 1`` raised ZeroDivisionError. Larger sizes are
    # unaffected.
    scale = math.log(10000) / max(half_dim - 1, 1)
    freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -scale)
    freqs = freqs.to(device=timesteps.device)
    emb = timesteps.float()[:, None] * freqs[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
39
+
40
+
41
def nonlinearity(x):
    """Swish activation: return ``x * sigmoid(x)`` element-wise."""
    gate = torch.sigmoid(x)
    return x*gate
44
+
45
+
46
def Normalize(in_channels, num_groups=32):
    """Return an affine ``GroupNorm`` over ``in_channels`` with eps 1e-6."""
    norm_layer = torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
    return norm_layer
48
+
49
+
50
class Upsample(nn.Module):
    """Nearest-neighbour 2x upsampling, optionally followed by a 3x3 conv."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # Channel-preserving 3x3 convolution applied after upsampling.
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        upsampled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
        if not self.with_conv:
            return upsampled
        return self.conv(upsampled)
66
+
67
+
68
class Downsample(nn.Module):
    """2x downsampling via a strided 3x3 conv or 2x2 average pooling."""

    def __init__(self, in_channels, with_conv):
        super().__init__()
        self.with_conv = with_conv
        if with_conv:
            # torch convs have no asymmetric padding, so the conv is created
            # unpadded and forward() pads right/bottom by hand.
            self.conv = torch.nn.Conv2d(in_channels, in_channels,
                                        kernel_size=3, stride=2, padding=0)

    def forward(self, x):
        if not self.with_conv:
            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
        # (left, right, top, bottom): one zero column right, one zero row below.
        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
        return self.conv(padded)
88
+
89
+
90
class ResnetBlock(nn.Module):
    """Residual block: two (norm -> swish -> 3x3 conv) stages plus a shortcut.

    An optional timestep embedding is projected and added between the two
    stages. When input and output channel counts differ, the skip path is
    a 3x3 conv (``conv_shortcut=True``) or a 1x1 conv.
    """

    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
                 dropout, temb_channels=512):
        super().__init__()
        if out_channels is None:
            out_channels = in_channels
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.norm1 = Normalize(in_channels)
        self.conv1 = torch.nn.Conv2d(in_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)
        if temb_channels > 0:
            # Only created when a timestep embedding width is configured.
            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
        self.norm2 = Normalize(out_channels)
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels, out_channels,
                                     kernel_size=3, stride=1, padding=1)
        if in_channels != out_channels:
            if conv_shortcut:
                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                     kernel_size=3, stride=1, padding=1)
            else:
                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels,
                                                    kernel_size=1, stride=1, padding=0)

    def forward(self, x, temb):
        h = self.conv1(nonlinearity(self.norm1(x)))

        if temb is not None:
            # Broadcast the projected embedding over the two spatial axes.
            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]

        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))

        if self.in_channels == self.out_channels:
            return x+h
        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
        return shortcut(x)+h
150
+
151
+
152
class AttnBlock(nn.Module):
    """Single-head spatial self-attention with a residual connection."""

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        def conv1x1():
            # Channel-preserving 1x1 projection.
            return torch.nn.Conv2d(in_channels, in_channels,
                                   kernel_size=1, stride=1, padding=0)

        self.norm = Normalize(in_channels)
        self.q = conv1x1()
        self.k = conv1x1()
        self.v = conv1x1()
        self.proj_out = conv1x1()

    def forward(self, x):
        normed = self.norm(x)
        q, k, v = self.q(normed), self.k(normed), self.v(normed)

        # compute attention
        b, c, h, w = q.shape
        n = h*w
        q = q.reshape(b, c, n).permute(0, 2, 1)   # b,hw,c
        k = k.reshape(b, c, n)                    # b,c,hw
        scores = torch.bmm(q, k)                  # b,hw,hw: scores[b,i,j] = sum_c q[b,i,c] k[b,c,j]
        scores = scores * (int(c)**(-0.5))
        weights = torch.nn.functional.softmax(scores, dim=2)

        # attend to values
        v = v.reshape(b, c, n)
        weights = weights.permute(0, 2, 1)        # b,hw,hw (first hw of k, second of q)
        attended = torch.bmm(v, weights)          # b,c,hw (hw of q)
        attended = attended.reshape(b, c, h, w)

        return x+self.proj_out(attended)
204
+
205
class MemoryEfficientAttnBlock(nn.Module):
    """
    Self-attention block backed by ``xformers.ops.memory_efficient_attention``,
    see https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
    Note: this is a single-head self-attention operation
    """

    def __init__(self, in_channels):
        super().__init__()
        self.in_channels = in_channels

        def conv1x1():
            # Channel-preserving 1x1 projection.
            return torch.nn.Conv2d(in_channels, in_channels,
                                   kernel_size=1, stride=1, padding=0)

        self.norm = Normalize(in_channels)
        self.q = conv1x1()
        self.k = conv1x1()
        self.v = conv1x1()
        self.proj_out = conv1x1()
        # Passed to xformers as ``op``; None by default.
        self.attention_op: Optional[Any] = None

    def forward(self, x):
        normed = self.norm(x)
        q, k, v = self.q(normed), self.k(normed), self.v(normed)

        # compute attention
        B, C, H, W = q.shape

        def to_tokens(t):
            # (B, C, H, W) -> (B, H*W, C), routed through an explicit
            # single-head axis exactly as before.
            t = rearrange(t, 'b c h w -> b (h w) c')
            n_tokens = t.shape[1]
            return (
                t.unsqueeze(3)
                .reshape(B, n_tokens, 1, C)
                .permute(0, 2, 1, 3)
                .reshape(B * 1, n_tokens, C)
                .contiguous()
            )

        q, k, v = to_tokens(q), to_tokens(k), to_tokens(v)
        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=self.attention_op)

        n_out = out.shape[1]
        out = (
            out.unsqueeze(0)
            .reshape(B, 1, n_out, C)
            .permute(0, 2, 1, 3)
            .reshape(B, n_out, C)
        )
        out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C)
        return x+self.proj_out(out)
269
+
270
+
271
class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention):
    """Adapter that lets the token-based attention run on image tensors."""

    def forward(self, x, context=None, mask=None):
        """Flatten ``x`` to tokens, attend, restore the layout, add the residual.

        :param x: input tensor; unpacked as ``b, c, h, w``.
        :param context: forwarded to the parent ``forward``.
        :param mask: forwarded to the parent ``forward``.
        :return: ``x + attention(x)`` in the original ``(b, c, h, w)`` layout.
        """
        b, c, h, w = x.shape
        # Keep ``x`` untouched for the residual. It used to be overwritten by
        # its flattened (b, hw, c) form, so ``x + out`` mixed two layouts.
        tokens = rearrange(x, 'b c h w -> b (h w) c')
        out = super().forward(tokens, context=context, mask=mask)
        out = rearrange(out, 'b (h w) c -> b c h w', h=h, w=w, c=c)
        return x + out
278
+
279
+
280
def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None):
    """Create the attention module selected by ``attn_type``.

    :param in_channels: channel count handed to the attention block.
    :param attn_type: one of "vanilla", "vanilla-xformers",
        "memory-efficient-cross-attn", "linear" or "none". "vanilla" is
        upgraded to "vanilla-xformers" when xformers was importable.
    :param attn_kwargs: extra constructor arguments, used only by
        "memory-efficient-cross-attn"; must be ``None`` for "vanilla".
    :raises NotImplementedError: for "linear", which passes the assertion
        but has no implementation in this function.
    """
    assert attn_type in ["vanilla", "vanilla-xformers", "memory-efficient-cross-attn", "linear", "none"], f'attn_type {attn_type} unknown'
    if XFORMERS_IS_AVAILBLE and attn_type == "vanilla":
        attn_type = "vanilla-xformers"
    print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
    if attn_type == "vanilla":
        assert attn_kwargs is None
        return AttnBlock(in_channels)
    elif attn_type == "vanilla-xformers":
        print(f"building MemoryEfficientAttnBlock with {in_channels} in_channels...")
        return MemoryEfficientAttnBlock(in_channels)
    elif attn_type == "memory-efficient-cross-attn":
        # This branch used to test the builtin ``type`` and was unreachable.
        # Work on a copy so the caller's dict is not mutated.
        kwargs = dict(attn_kwargs or {})
        kwargs["query_dim"] = in_channels
        return MemoryEfficientCrossAttentionWrapper(**kwargs)
    elif attn_type == "none":
        return nn.Identity(in_channels)
    else:
        raise NotImplementedError()
298
+
299
+
300
class Model(nn.Module):
    """U-Net style network built from ResnetBlocks with optional attention.

    The down path stores every intermediate activation; the up path pops
    them back and concatenates them as skip connections.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = self.ch*4
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        def resnet(c_in, c_out):
            # All residual blocks share the embedding width and dropout.
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=self.temb_ch,
                               dropout=dropout)

        self.use_timestep = use_timestep
        if self.use_timestep:
            # timestep embedding
            self.temb = nn.Module()
            self.temb.dense = nn.ModuleList([
                torch.nn.Linear(self.ch, self.temb_ch),
                torch.nn.Linear(self.temb_ch, self.temb_ch),
            ])

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch,
                                       kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        last_level = self.num_resolutions-1
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            blocks = nn.ModuleList()
            attns = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                blocks.append(resnet(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attns.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = blocks
            stage.attn = attns
            if i_level != last_level:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = resnet(block_in, block_in)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = resnet(block_in, block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            blocks = nn.ModuleList()
            attns = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            skip_in = ch*ch_mult[i_level]
            for i_block in range(self.num_res_blocks+1):
                if i_block == self.num_res_blocks:
                    # The last block of a level consumes the skip recorded
                    # at the input of the matching down level.
                    skip_in = ch*in_ch_mult[i_level]
                blocks.append(resnet(block_in+skip_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attns.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = blocks
            stage.attn = attns
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_ch,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x, t=None, context=None):
        #assert x.shape[2] == x.shape[3] == self.resolution
        if context is not None:
            # assume aligned context, cat along channel axis
            x = torch.cat((x, context), dim=1)

        temb = None
        if self.use_timestep:
            # timestep embedding
            assert t is not None
            temb = get_timestep_embedding(t, self.ch)
            temb = self.temb.dense[0](temb)
            temb = nonlinearity(temb)
            temb = self.temb.dense[1](temb)

        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions-1:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                skip = hs.pop()
                h = stage.block[i_block](torch.cat([h, skip], dim=1), temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        h = self.conv_out(nonlinearity(self.norm_out(h)))
        return h

    def get_last_layer(self):
        """Return the weight tensor of the final output convolution."""
        return self.conv_out.weight
450
+
451
+
452
class Encoder(nn.Module):
    """Downsampling encoder: conv_in -> resolution levels -> middle -> conv_out.

    No timestep embedding is used (``temb_ch`` is 0). The output has
    ``2 * z_channels`` channels when ``double_z`` is true, else ``z_channels``.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
                 **ignore_kwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels

        def resnet(c_in, c_out):
            # All residual blocks share the embedding width and dropout.
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=self.temb_ch,
                               dropout=dropout)

        # downsampling
        self.conv_in = torch.nn.Conv2d(in_channels, self.ch,
                                       kernel_size=3, stride=1, padding=1)

        curr_res = resolution
        in_ch_mult = (1,)+tuple(ch_mult)
        self.in_ch_mult = in_ch_mult
        last_level = self.num_resolutions-1
        self.down = nn.ModuleList()
        for i_level in range(self.num_resolutions):
            blocks = nn.ModuleList()
            attns = nn.ModuleList()
            block_in = ch*in_ch_mult[i_level]
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks):
                blocks.append(resnet(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attns.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = blocks
            stage.attn = attns
            if i_level != last_level:
                stage.downsample = Downsample(block_in, resamp_with_conv)
                curr_res = curr_res // 2
            self.down.append(stage)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = resnet(block_in, block_in)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = resnet(block_in, block_in)

        # end
        latent_channels = 2*z_channels if double_z else z_channels
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, latent_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # timestep embedding
        temb = None

        # downsampling
        hs = [self.conv_in(x)]
        for i_level in range(self.num_resolutions):
            stage = self.down[i_level]
            for i_block in range(self.num_res_blocks):
                h = stage.block[i_block](hs[-1], temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
                hs.append(h)
            if i_level != self.num_resolutions-1:
                hs.append(stage.downsample(hs[-1]))

        # middle
        h = hs[-1]
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # end
        h = self.conv_out(nonlinearity(self.norm_out(h)))
        return h
544
+
545
+
546
class Decoder(nn.Module):
    """Upsampling decoder: conv_in -> middle -> resolution levels -> conv_out.

    ``give_pre_end`` returns the features before the final norm/conv;
    ``tanh_out`` applies ``tanh`` to the final output.
    """

    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
                 attn_type="vanilla", **ignorekwargs):
        super().__init__()
        if use_linear_attn:
            attn_type = "linear"
        self.ch = ch
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        self.resolution = resolution
        self.in_channels = in_channels
        self.give_pre_end = give_pre_end
        self.tanh_out = tanh_out

        def resnet(c_in, c_out):
            # All residual blocks share the embedding width and dropout.
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=self.temb_ch,
                               dropout=dropout)

        # compute block_in and curr_res at lowest res
        block_in = ch*ch_mult[self.num_resolutions-1]
        curr_res = resolution // 2**(self.num_resolutions-1)
        self.z_shape = (1,z_channels,curr_res,curr_res)
        print("Working with z of shape {} = {} dimensions.".format(
            self.z_shape, np.prod(self.z_shape)))

        # z to block_in
        self.conv_in = torch.nn.Conv2d(z_channels, block_in,
                                       kernel_size=3, stride=1, padding=1)

        # middle
        self.mid = nn.Module()
        self.mid.block_1 = resnet(block_in, block_in)
        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
        self.mid.block_2 = resnet(block_in, block_in)

        # upsampling
        self.up = nn.ModuleList()
        for i_level in reversed(range(self.num_resolutions)):
            blocks = nn.ModuleList()
            attns = nn.ModuleList()
            block_out = ch*ch_mult[i_level]
            for _ in range(self.num_res_blocks+1):
                blocks.append(resnet(block_in, block_out))
                block_in = block_out
                if curr_res in attn_resolutions:
                    attns.append(make_attn(block_in, attn_type=attn_type))
            stage = nn.Module()
            stage.block = blocks
            stage.attn = attns
            if i_level != 0:
                stage.upsample = Upsample(block_in, resamp_with_conv)
                curr_res = curr_res * 2
            self.up.insert(0, stage) # prepend to get consistent order

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_ch,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, z):
        #assert z.shape[1:] == self.z_shape[1:]
        self.last_z_shape = z.shape

        # timestep embedding
        temb = None

        # z to block_in
        h = self.conv_in(z)

        # middle
        h = self.mid.block_1(h, temb)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h, temb)

        # upsampling
        for i_level in reversed(range(self.num_resolutions)):
            stage = self.up[i_level]
            for i_block in range(self.num_res_blocks+1):
                h = stage.block[i_block](h, temb)
                if len(stage.attn) > 0:
                    h = stage.attn[i_block](h)
            if i_level != 0:
                h = stage.upsample(h)

        # end
        if self.give_pre_end:
            return h

        h = self.conv_out(nonlinearity(self.norm_out(h)))
        return torch.tanh(h) if self.tanh_out else h
653
+
654
+
655
class SimpleDecoder(nn.Module):
    """Small decoder: 1x1 conv, three ResnetBlocks, 1x1 conv, 2x upsample, output conv."""

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__()

        def resnet(c_in, c_out):
            return ResnetBlock(in_channels=c_in,
                               out_channels=c_out,
                               temb_channels=0, dropout=0.0)

        layers = [
            nn.Conv2d(in_channels, in_channels, 1),
            resnet(in_channels, 2 * in_channels),
            resnet(2 * in_channels, 4 * in_channels),
            resnet(4 * in_channels, 2 * in_channels),
            nn.Conv2d(2*in_channels, in_channels, 1),
            Upsample(in_channels, with_conv=True),
        ]
        self.model = nn.ModuleList(layers)
        # end
        self.norm_out = Normalize(in_channels)
        self.conv_out = torch.nn.Conv2d(in_channels, out_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # Positions 1..3 hold the ResnetBlocks, which take an extra
        # (unused here) timestep-embedding argument.
        resnet_positions = (1, 2, 3)
        for position, layer in enumerate(self.model):
            x = layer(x, None) if position in resnet_positions else layer(x)

        h = nonlinearity(self.norm_out(x))
        return self.conv_out(h)
689
+
690
+
691
class UpsampleDecoder(nn.Module):
    """Stack of ResnetBlock groups with a conv upsample between groups."""

    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
                 ch_mult=(2,2), dropout=0.0):
        super().__init__()
        # upsampling
        self.temb_ch = 0
        self.num_resolutions = len(ch_mult)
        self.num_res_blocks = num_res_blocks
        block_in = in_channels
        # curr_res is tracked as in the original but is not read by any layer.
        curr_res = resolution // 2 ** (self.num_resolutions - 1)
        self.res_blocks = nn.ModuleList()
        self.upsample_blocks = nn.ModuleList()
        last_level = self.num_resolutions - 1
        for i_level in range(self.num_resolutions):
            group = []
            block_out = ch * ch_mult[i_level]
            for _ in range(self.num_res_blocks + 1):
                group.append(ResnetBlock(in_channels=block_in,
                                         out_channels=block_out,
                                         temb_channels=self.temb_ch,
                                         dropout=dropout))
                block_in = block_out
            self.res_blocks.append(nn.ModuleList(group))
            if i_level != last_level:
                self.upsample_blocks.append(Upsample(block_in, True))
                curr_res = curr_res * 2

        # end
        self.norm_out = Normalize(block_in)
        self.conv_out = torch.nn.Conv2d(block_in, out_channels,
                                        kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        # upsampling
        h = x
        last_level = self.num_resolutions - 1
        for i_level in range(self.num_resolutions):
            group = self.res_blocks[i_level]
            for i_block in range(self.num_res_blocks + 1):
                h = group[i_block](h, None)
            if i_level != last_level:
                h = self.upsample_blocks[i_level](h)
        h = self.conv_out(nonlinearity(self.norm_out(h)))
        return h
737
+
738
+
739
class LatentRescaler(nn.Module):
    """Residual blocks -> spatial interpolation by ``factor`` -> attention -> residual blocks."""

    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
        super().__init__()
        # residual block, interpolate, residual block
        self.factor = factor

        def res_stack():
            return nn.ModuleList([ResnetBlock(in_channels=mid_channels,
                                              out_channels=mid_channels,
                                              temb_channels=0,
                                              dropout=0.0) for _ in range(depth)])

        self.conv_in = nn.Conv2d(in_channels, mid_channels,
                                 kernel_size=3, stride=1, padding=1)
        self.res_block1 = res_stack()
        self.attn = AttnBlock(mid_channels)
        self.res_block2 = res_stack()

        self.conv_out = nn.Conv2d(mid_channels, out_channels, kernel_size=1)

    def forward(self, x):
        x = self.conv_in(x)
        for block in self.res_block1:
            x = block(x, None)
        # Target size is the current spatial size scaled by factor, rounded.
        new_h = int(round(x.shape[2]*self.factor))
        new_w = int(round(x.shape[3]*self.factor))
        x = torch.nn.functional.interpolate(x, size=(new_h, new_w))
        x = self.attn(x)
        for block in self.res_block2:
            x = block(x, None)
        return self.conv_out(x)
774
+
775
+
776
class MergedRescaleEncoder(nn.Module):
    """An ``Encoder`` followed by a ``LatentRescaler``."""

    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        # The encoder emits ch * ch_mult[-1] channels, which the rescaler
        # keeps internally and maps to out_ch.
        intermediate_chn = ch * ch_mult[-1]
        self.encoder = Encoder(
            in_channels=in_channels,
            num_res_blocks=num_res_blocks,
            ch=ch,
            ch_mult=ch_mult,
            z_channels=intermediate_chn,
            double_z=False,
            resolution=resolution,
            attn_resolutions=attn_resolutions,
            dropout=dropout,
            resamp_with_conv=resamp_with_conv,
            out_ch=None,
        )
        self.rescaler = LatentRescaler(
            factor=rescale_factor,
            in_channels=intermediate_chn,
            mid_channels=intermediate_chn,
            out_channels=out_ch,
            depth=rescale_module_depth,
        )

    def forward(self, x):
        return self.rescaler(self.encoder(x))
793
+
794
+
795
class MergedRescaleDecoder(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder``."""

    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
        super().__init__()
        # Width of the rescaler output, which is also the decoder input.
        tmp_chn = z_channels*ch_mult[-1]
        self.decoder = Decoder(
            out_ch=out_ch,
            z_channels=tmp_chn,
            attn_resolutions=attn_resolutions,
            dropout=dropout,
            resamp_with_conv=resamp_with_conv,
            in_channels=None,
            num_res_blocks=num_res_blocks,
            ch_mult=ch_mult,
            resolution=resolution,
            ch=ch,
        )
        self.rescaler = LatentRescaler(
            factor=rescale_factor,
            in_channels=z_channels,
            mid_channels=tmp_chn,
            out_channels=tmp_chn,
            depth=rescale_module_depth,
        )

    def forward(self, x):
        return self.decoder(self.rescaler(x))
810
+
811
+
812
class Upsampler(nn.Module):
    """A ``LatentRescaler`` followed by a ``Decoder`` sized from in/out sizes."""

    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
        super().__init__()
        assert out_size >= in_size
        # One decoder level per power of two in the integer ratio, plus one.
        num_blocks = int(np.log2(out_size//in_size))+1
        factor_up = 1.+ (out_size % in_size)
        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
        self.rescaler = LatentRescaler(
            factor=factor_up,
            in_channels=in_channels,
            mid_channels=2*in_channels,
            out_channels=in_channels,
        )
        self.decoder = Decoder(
            out_ch=out_channels,
            resolution=out_size,
            z_channels=in_channels,
            num_res_blocks=2,
            attn_resolutions=[],
            in_channels=None,
            ch=in_channels,
            ch_mult=[ch_mult for _ in range(num_blocks)],
        )

    def forward(self, x):
        return self.decoder(self.rescaler(x))
829
+
830
+
831
class Resize(nn.Module):
    """Resize a feature map by interpolation with a fixed ``mode``.

    Learned (convolutional) resizing is not implemented: constructing with
    ``learned=True`` raises ``NotImplementedError``.
    """

    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
        """
        :param in_channels: unused while learned resizing is unimplemented.
        :param learned: request learned downsampling (not implemented).
        :param mode: interpolation mode handed to ``interpolate``.
        :raises NotImplementedError: when ``learned`` is true.
        """
        super().__init__()
        self.with_conv = learned
        self.mode = mode
        if self.with_conv:
            # ``__name`` (missing trailing underscores) was name-mangled and
            # raised AttributeError before the intended error was reached.
            print(f"Note: {self.__class__.__name__} uses learned downsampling and will ignore the fixed {mode} mode")
            # The strided-conv setup that followed this raise was
            # unreachable and has been dropped.
            raise NotImplementedError()

    def forward(self, x, scale_factor=1.0):
        """Return ``x`` unchanged for ``scale_factor == 1.0``, else interpolate."""
        if scale_factor==1.0:
            return x
        return torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
ldm/modules/diffusionmodules/openaimodel.py ADDED
@@ -0,0 +1,807 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+ import math
3
+
4
+ import numpy as np
5
+ import torch as th
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from ldm.modules.diffusionmodules.util import (
10
+ checkpoint,
11
+ conv_nd,
12
+ linear,
13
+ avg_pool_nd,
14
+ zero_module,
15
+ normalization,
16
+ timestep_embedding,
17
+ )
18
+ from ldm.modules.attention import SpatialTransformer
19
+ from ldm.util import exists
20
+
21
+
22
+ # dummy replace
23
def convert_module_to_f16(x):
    """Dummy replacement: accepts ``x`` and does nothing."""
    return None
25
+
26
def convert_module_to_f32(x):
    """Dummy replacement: accepts ``x`` and does nothing."""
    return None
28
+
29
+
30
+ ## go
31
class AttentionPool2d(nn.Module):
    """
    Attention pooling over spatial positions.
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        # One position per spatial location plus one for the prepended mean token.
        n_positions = spacial_dim ** 2 + 1
        pos_init = th.randn(embed_dim, n_positions) / embed_dim ** 0.5
        self.positional_embedding = nn.Parameter(pos_init)
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        b, c, *_spatial = x.shape
        flat = x.reshape(b, c, -1)  # NC(HW)
        mean_token = flat.mean(dim=-1, keepdim=True)
        tokens = th.cat([mean_token, flat], dim=-1)  # NC(HW+1)
        tokens = tokens + self.positional_embedding[None, :, :].to(tokens.dtype)  # NC(HW+1)
        tokens = self.c_proj(self.attention(self.qkv_proj(tokens)))
        # Position 0 is the mean token that was prepended above.
        return tokens[:, :, 0]
59
+
60
+
61
class TimestepBlock(nn.Module):
    """
    Base class for modules whose forward() receives timestep embeddings as
    its second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Transform `x` using the timestep embeddings `emb`.
        """
71
+
72
+
73
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential container that routes the extra inputs to each child:
    TimestepBlock children get the timestep embedding, SpatialTransformer
    children get the context, and every other child gets only the features.
    """

    def forward(self, x, emb, context=None):
        out = x
        for child in self:
            # TimestepBlock is tested first, exactly as in the dispatch order
            # callers have always relied on.
            if isinstance(child, TimestepBlock):
                out = child(out, emb)
                continue
            if isinstance(child, SpatialTransformer):
                out = child(out, context)
                continue
            out = child(out)
        return out
88
+
89
+
90
class Upsample(nn.Module):
    """
    Nearest-neighbour 2x upsampling, optionally followed by a convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    :param out_channels: output channels of the convolution; defaults to
                 `channels`.
    :param padding: padding passed to the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(
                dims, self.channels, self.out_channels, 3, padding=padding
            )

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # Keep the first spatial axis, double only the last two.
            depth, height, width = x.shape[2], x.shape[3], x.shape[4]
            target_size = (depth, height * 2, width * 2)
            x = F.interpolate(x, target_size, mode="nearest")
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(x) if self.use_conv else x
119
+
120
class TransposedUpsample(nn.Module):
    """
    Learned 2x upsampling without padding, implemented as a stride-2
    transposed 2D convolution.
    :param channels: channels in the inputs.
    :param out_channels: channels in the outputs; defaults to `channels`.
    :param ks: kernel size of the transposed convolution.
    """

    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels

        self.up = nn.ConvTranspose2d(
            self.channels,
            self.out_channels,
            kernel_size=ks,
            stride=2,
        )

    def forward(self, x):
        upsampled = self.up(x)
        return upsampled
131
+
132
+
133
class Downsample(nn.Module):
    """
    Stride-2 downsampling, done either with a convolution or with average
    pooling.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    :param out_channels: output channels; must equal `channels` when pooling.
    :param padding: padding passed to the convolution.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        # In 3D the first spatial axis keeps its size.
        stride = (1, 2, 2) if dims == 3 else 2
        if not use_conv:
            # Pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
        else:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
            )

    def forward(self, x):
        assert x.shape[1] == self.channels
        downsampled = self.op(x)
        return downsampled
160
+
161
+
162
class ResBlock(TimestepBlock):
    """
    A timestep-conditioned residual block that can change the channel count
    and optionally resample its input.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param use_scale_shift_norm: if True, the embedding is split into a scale
        and a shift applied after the output normalization instead of being
        added to the features.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        if self.updown:
            # `up` wins when both flags are set.
            resample_cls = Upsample if up else Downsample
            self.h_upd = resample_cls(channels, False, dims)
            self.x_upd = resample_cls(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        # Scale-shift conditioning needs twice the width (scale and shift).
        emb_width = self.out_channels
        if use_scale_shift_norm:
            emb_width = 2 * self.out_channels
        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(emb_channels, emb_width),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        channels_change = self.out_channels != channels
        if not channels_change:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        inputs = (x, emb)
        return checkpoint(
            self._forward, inputs, self.parameters(), self.use_checkpoint
        )

    def _forward(self, x, emb):
        if self.updown:
            # Resample between the norm/activation and the first convolution,
            # and resample the skip path the same way.
            pre_conv, first_conv = self.in_layers[:-1], self.in_layers[-1]
            h = first_conv(self.h_upd(pre_conv(x)))
            x = self.x_upd(x)
        else:
            h = self.in_layers(x)

        emb_out = self.emb_layers(emb).type(h.dtype)
        # Append trailing singleton axes so the embedding broadcasts over h.
        missing_axes = len(h.shape) - len(emb_out.shape)
        if missing_axes > 0:
            emb_out = emb_out[(...,) + (None,) * missing_axes]

        if self.use_scale_shift_norm:
            norm, remaining = self.out_layers[0], self.out_layers[1:]
            scale, shift = th.chunk(emb_out, 2, dim=1)
            h = remaining(norm(h) * (1 + scale) + shift)
        else:
            h = self.out_layers(h + emb_out)
        return self.skip_connection(x) + h
275
+
276
+
277
class AttentionBlock(nn.Module):
    """
    A self-attention block over flattened spatial positions, added back onto
    its input as a residual.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    :param channels: channels in the inputs and outputs.
    :param num_heads: number of heads, used when num_head_channels is -1.
    :param num_head_channels: if not -1, the width of each head; the number
        of heads is then channels // num_head_channels.
    :param use_checkpoint: stored on the module; forward() currently forces
        checkpointing on regardless of this value.
    :param use_new_attention_order: select QKVAttention instead of
        QKVAttentionLegacy.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        super().__init__()
        self.channels = channels
        if num_head_channels != -1:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        else:
            self.num_heads = num_heads
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        # New order: split qkv before split heads.
        # Legacy order: split heads before split qkv.
        attention_cls = QKVAttention if use_new_attention_order else QKVAttentionLegacy
        self.attention = attention_cls(self.num_heads)

        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        # TODO: check checkpoint usage, is True # TODO: fix the .half call!!!
        # The flag is hard-coded to True here rather than read from
        # self.use_checkpoint.
        inputs = (x,)
        return checkpoint(self._forward, inputs, self.parameters(), True)

    def _forward(self, x):
        batch, channels, *spatial = x.shape
        flat = x.reshape(batch, channels, -1)
        attended = self.proj_out(self.attention(self.qkv(self.norm(flat))))
        return (flat + attended).reshape(batch, channels, *spatial)
324
+
325
+
326
def count_flops_attn(model, _x, y):
    """
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )

    :param model: object whose `total_ops` attribute is incremented in place.
    :param _x: unused.
    :param y: sequence whose first element is an [N x C x ...] Tensor.
    """
    batch, channels, *spatial_dims = y[0].shape
    positions = int(np.prod(spatial_dims))
    # Two matmuls with the same number of ops: one builds the weight matrix,
    # the other combines the value vectors.
    ops = 2 * batch * (positions ** 2) * channels
    model.total_ops += th.DoubleTensor([ops])
344
+
345
+
346
class QKVAttentionLegacy(nn.Module):
    """
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, seq_len = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        # Fold heads into the batch axis first, then slice q/k/v per head.
        per_head = qkv.reshape(batch * self.n_heads, head_dim * 3, seq_len)
        query, key, value = per_head.split(head_dim, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing
        # the product afterwards.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        logits = th.einsum("bct,bcs->bts", query * scale, key * scale)
        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
        attended = th.einsum("bts,bcs->bct", probs, value)
        return attended.reshape(batch, -1, seq_len)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
376
+
377
+
378
class QKVAttention(nn.Module):
    """
    A module which performs QKV attention and splits in a different order:
    q, k and v are separated first and only then divided into heads.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, seq_len = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        folded = (batch * self.n_heads, head_dim, seq_len)
        query, key, value = qkv.chunk(3, dim=1)
        # Scaling q and k separately is more stable with f16 than dividing
        # the product afterwards.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        scaled_q = (query * scale).view(*folded)
        scaled_k = (key * scale).view(*folded)
        logits = th.einsum("bct,bcs->bts", scaled_q, scaled_k)
        probs = th.softmax(logits.float(), dim=-1).type(logits.dtype)
        attended = th.einsum("bts,bcs->bct", probs, value.reshape(*folded))
        return attended.reshape(batch, -1, seq_len)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
410
+
411
+
412
class Timestep(nn.Module):
    """
    Module wrapper around `timestep_embedding` with a fixed embedding width.
    :param dim: the width forwarded to `timestep_embedding`.
    """

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        embedding = timestep_embedding(t, self.dim)
        return embedding
419
+
420
+
421
class UNetModel(nn.Module):
    """
    The full UNet model with attention and timestep embedding.
    :param image_size: stored on the module; not otherwise used in this class.
    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample, either
        as one int for all levels or as a per-level sequence with the same
        length as channel_mult.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes. The strings
        "continuous" and "sequential" select a linear layer or an MLP
        (fed by `adm_in_channels`) as the label embedding instead.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param use_fp16: cast the input to float16 inside forward().
    :param use_bf16: cast the input to bfloat16 inside forward(); takes
        precedence over use_fp16.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_head_channels: if specified, ignore num_heads and instead use
        a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
        of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
        increased efficiency.
    :param use_spatial_transformer: use SpatialTransformer blocks instead of
        AttentionBlock; requires context_dim.
    :param transformer_depth: depth passed to each SpatialTransformer.
    :param context_dim: context dimension passed to each SpatialTransformer.
    :param n_embed: if set, forward() returns the output of an id-predictor
        head with `n_embed` channels instead of the regular output head.
    :param legacy: if True, recompute the per-head width the legacy way.
    :param disable_self_attentions: optional per-level booleans passed to the
        SpatialTransformer blocks as `disable_self_attn`.
    :param num_attention_blocks: optional per-level cap on how many blocks of
        a level receive attention.
    :param disable_middle_self_attn: `disable_self_attn` for the middle block.
    :param use_linear_in_transformer: `use_linear` for SpatialTransformer.
    :param adm_in_channels: input width of the "sequential" label embedding.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        use_fp16=False,
        use_bf16=False,
        num_heads=-1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        use_spatial_transformer=False,    # custom transformer support
        transformer_depth=1,              # custom transformer support
        context_dim=None,                 # custom transformer support
        n_embed=None,                     # custom support for prediction of discrete ids into codebook of first stage vq model
        legacy=True,
        disable_self_attentions=None,
        num_attention_blocks=None,
        disable_middle_self_attn=False,
        use_linear_in_transformer=False,
        adm_in_channels=None,
    ):
        super().__init__()
        if use_spatial_transformer:
            assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'

        if context_dim is not None:
            assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
            from omegaconf.listconfig import ListConfig
            if isinstance(context_dim, ListConfig):
                context_dim = list(context_dim)

        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        if num_heads == -1:
            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'

        if num_head_channels == -1:
            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'

        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        if isinstance(num_res_blocks, int):
            self.num_res_blocks = len(channel_mult) * [num_res_blocks]
        else:
            if len(num_res_blocks) != len(channel_mult):
                raise ValueError("provide num_res_blocks either as an int (globally constant) or "
                                 "as a list/tuple (per-level) with the same length as channel_mult")
            self.num_res_blocks = num_res_blocks
        if disable_self_attentions is not None:
            # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
            assert len(disable_self_attentions) == len(channel_mult)
        if num_attention_blocks is not None:
            assert len(num_attention_blocks) == len(self.num_res_blocks)
            assert all(
                n_res >= n_attn
                for n_res, n_attn in zip(self.num_res_blocks, num_attention_blocks)
            )
            print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
                  f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
                  f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
                  f"attention will still not be set.")

        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.dtype = th.bfloat16 if use_bf16 else self.dtype
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample
        self.predict_codebook_ids = n_embed is not None

        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        if self.num_classes is not None:
            if isinstance(self.num_classes, int):
                self.label_emb = nn.Embedding(num_classes, time_embed_dim)
            elif self.num_classes == "continuous":
                print("setting up linear c_adm embedding layer")
                self.label_emb = nn.Linear(1, time_embed_dim)
            elif self.num_classes == "sequential":
                assert adm_in_channels is not None
                self.label_emb = nn.Sequential(
                    nn.Sequential(
                        linear(adm_in_channels, time_embed_dim),
                        nn.SiLU(),
                        linear(time_embed_dim, time_embed_dim),
                    )
                )
            else:
                raise ValueError(
                    f"unsupported num_classes {self.num_classes!r}: expected an int, "
                    f"'continuous' or 'sequential'"
                )

        self.input_blocks = nn.ModuleList(
            [
                TimestepEmbedSequential(
                    conv_nd(dims, in_channels, model_channels, 3, padding=1)
                )
            ]
        )
        self._feature_size = model_channels
        # Channel count of every input block output, consumed in reverse by
        # the output blocks to size their concatenated skip inputs.
        input_block_chans = [model_channels]
        ch = model_channels
        ds = 1  # current downsample rate
        for level, mult in enumerate(channel_mult):
            for nr in range(self.num_res_blocks[level]):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=mult * model_channels,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = mult * model_channels
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
                        layers.append(
                            AttentionBlock(
                                ch,
                                use_checkpoint=use_checkpoint,
                                num_heads=num_heads,
                                num_head_channels=dim_head,
                                use_new_attention_order=use_new_attention_order,
                            ) if not use_spatial_transformer else SpatialTransformer(
                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint
                            )
                        )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                # Every level except the last ends with a downsampling block.
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        if num_head_channels == -1:
            dim_head = ch // num_heads
        else:
            num_heads = ch // num_head_channels
            dim_head = num_head_channels
        if legacy:
            #num_heads = 1
            dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=dim_head,
                use_new_attention_order=use_new_attention_order,
            ) if not use_spatial_transformer else SpatialTransformer(  # always uses a self-attn
                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
                use_checkpoint=use_checkpoint
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch

        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(channel_mult))[::-1]:
            for i in range(self.num_res_blocks[level] + 1):
                # Width of the skip tensor concatenated in forward().
                ich = input_block_chans.pop()
                layers = [
                    ResBlock(
                        ch + ich,
                        time_embed_dim,
                        dropout,
                        out_channels=model_channels * mult,
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = model_channels * mult
                if ds in attention_resolutions:
                    if num_head_channels == -1:
                        dim_head = ch // num_heads
                    else:
                        num_heads = ch // num_head_channels
                        dim_head = num_head_channels
                    if legacy:
                        #num_heads = 1
                        dim_head = ch // num_heads if use_spatial_transformer else num_head_channels
                    if exists(disable_self_attentions):
                        disabled_sa = disable_self_attentions[level]
                    else:
                        disabled_sa = False

                    if not exists(num_attention_blocks) or i < num_attention_blocks[level]:
                        layers.append(
                            AttentionBlock(
                                ch,
                                use_checkpoint=use_checkpoint,
                                num_heads=num_heads_upsample,
                                num_head_channels=dim_head,
                                use_new_attention_order=use_new_attention_order,
                            ) if not use_spatial_transformer else SpatialTransformer(
                                ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
                                disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
                                use_checkpoint=use_checkpoint
                            )
                        )
                if level and i == self.num_res_blocks[level]:
                    # Last block of every level but level 0 also upsamples.
                    out_ch = ch
                    layers.append(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            up=True,
                        )
                        if resblock_updown
                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch

        # At this point ch == model_channels * channel_mult[0]. The heads are
        # sized from ch so that normalization and convolution agree even when
        # channel_mult[0] != 1; for channel_mult[0] == 1 the weight shapes are
        # the same as when sized from model_channels.
        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(conv_nd(dims, ch, out_channels, 3, padding=1)),
        )
        if self.predict_codebook_ids:
            self.id_predictor = nn.Sequential(
                normalization(ch),
                conv_nd(dims, ch, n_embed, 1),
                #nn.LogSoftmax(dim=1)  # change to cross_entropy and produce non-normalized logits
            )

    def convert_to_fp16(self):
        """
        Apply convert_module_to_f16 to the input, middle and output blocks.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)
        self.output_blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Apply convert_module_to_f32 to the input, middle and output blocks.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)
        self.output_blocks.apply(convert_module_to_f32)

    def forward(self, x, timesteps=None, context=None, y=None,**kwargs):
        """
        Apply the model to an input batch.
        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param context: conditioning plugged in via crossattn
        :param y: an [N] Tensor of labels, if class-conditional.
        :param kwargs: accepted and ignored.
        :return: an [N x C x ...] Tensor of outputs.
        """
        assert (y is not None) == (
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"
        hs = []
        t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False)
        emb = self.time_embed(t_emb)

        if self.num_classes is not None:
            assert y.shape[0] == x.shape[0]
            emb = emb + self.label_emb(y)

        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb, context)
            hs.append(h)
        h = self.middle_block(h, emb, context)
        for module in self.output_blocks:
            # Skip connections are consumed in reverse order of production.
            h = th.cat([h, hs.pop()], dim=1)
            h = module(h, emb, context)
        h = h.type(x.dtype)
        if self.predict_codebook_ids:
            return self.id_predictor(h)
        else:
            return self.out(h)
ldm/modules/diffusionmodules/upscaling.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from functools import partial
5
+
6
+ from ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule
7
+ from ldm.util import default
8
+
9
+
10
class AbstractLowScaleModel(nn.Module):
    """
    Base class for concatenating a downsampled image to the latent
    representation. Optionally registers a noise schedule used by q_sample().

    :param noise_schedule_config: keyword arguments for register_schedule(),
        or None to skip registering a schedule.
    """

    def __init__(self, noise_schedule_config=None):
        super().__init__()
        if noise_schedule_config is None:
            return
        self.register_schedule(**noise_schedule_config)

    def register_schedule(self, beta_schedule="linear", timesteps=1000,
                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
        """
        Build the beta schedule and register the derived quantities as
        float32 buffers on the module.
        """
        betas = make_beta_schedule(
            beta_schedule,
            timesteps,
            linear_start=linear_start,
            linear_end=linear_end,
            cosine_s=cosine_s,
        )
        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        (step_count,) = betas.shape
        self.num_timesteps = int(step_count)
        self.linear_start = linear_start
        self.linear_end = linear_end
        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'

        schedule_buffers = (
            ('betas', betas),
            ('alphas_cumprod', alphas_cumprod),
            ('alphas_cumprod_prev', alphas_cumprod_prev),
            # calculations for diffusion q(x_t | x_{t-1}) and others
            ('sqrt_alphas_cumprod', np.sqrt(alphas_cumprod)),
            ('sqrt_one_minus_alphas_cumprod', np.sqrt(1. - alphas_cumprod)),
            ('log_one_minus_alphas_cumprod', np.log(1. - alphas_cumprod)),
            ('sqrt_recip_alphas_cumprod', np.sqrt(1. / alphas_cumprod)),
            ('sqrt_recipm1_alphas_cumprod', np.sqrt(1. / alphas_cumprod - 1)),
        )
        for buffer_name, values in schedule_buffers:
            self.register_buffer(buffer_name, torch.tensor(values, dtype=torch.float32))

    def q_sample(self, x_start, t, noise=None):
        """
        Mix `x_start` with noise using the schedule coefficients selected by
        `t`; fresh Gaussian noise is drawn when `noise` is not given.
        """
        noise = default(noise, lambda: torch.randn_like(x_start))
        signal_coeff = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape)
        noise_coeff = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
        return signal_coeff * x_start + noise_coeff * noise

    def forward(self, x):
        """Return the input unchanged together with None as the noise level."""
        return x, None

    def decode(self, x):
        """Return the input unchanged."""
        return x
54
+
55
+
56
class SimpleImageConcat(AbstractLowScaleModel):
    """
    Low-scale model without noise level conditioning: no schedule is
    registered and forward() reports a constant noise level of zero.
    """

    def __init__(self):
        super().__init__(noise_schedule_config=None)
        self.max_noise_level = 0

    def forward(self, x):
        # fix to constant noise level: one zero per batch element
        batch_size = x.shape[0]
        zero_levels = torch.zeros(batch_size, device=x.device).long()
        return x, zero_levels
65
+
66
+
67
class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel):
    """
    Low-scale model that noises its input with q_sample() at a given or
    randomly drawn noise level.

    :param noise_schedule_config: keyword arguments for register_schedule().
    :param max_noise_level: exclusive upper bound for randomly drawn levels.
    :param to_cuda: accepted but not used by this class.
    """

    def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False):
        super().__init__(noise_schedule_config=noise_schedule_config)
        self.max_noise_level = max_noise_level

    def forward(self, x, noise_level=None):
        if noise_level is not None:
            assert isinstance(noise_level, torch.Tensor)
        else:
            # Draw one level per batch element on the input's device.
            batch_shape = (x.shape[0],)
            noise_level = torch.randint(
                0, self.max_noise_level, batch_shape, device=x.device
            ).long()
        z = self.q_sample(x, noise_level)
        return z, noise_level
79
+
80
+
81
+
ldm/modules/diffusionmodules/util.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # adopted from
2
+ # https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
3
+ # and
4
+ # https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
5
+ # and
6
+ # https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
7
+ #
8
+ # thanks!
9
+
10
+
11
+ import os
12
+ import math
13
+ import torch
14
+ import torch.nn as nn
15
+ import numpy as np
16
+ from einops import repeat
17
+
18
+ from ldm.util import instantiate_from_config
19
+
20
+
21
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
    """
    Build a beta schedule and return it as a NumPy array.
    :param schedule: one of "linear", "cosine", "squaredcos_cap_v2",
                     "sqrt_linear" or "sqrt".
    :param n_timestep: the number of betas to produce.
    :param linear_start: first value used by the linspace-based schedules.
    :param linear_end: last value used by the linspace-based schedules.
    :param cosine_s: offset added to the normalised timesteps of the "cosine" schedule.
    :return: a NumPy array of betas.
    :raises ValueError: if `schedule` is not a known name.
    """
    if schedule == "linear":
        # linspace in sqrt-space, then square
        betas = (
                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
        )

    elif schedule == "cosine":
        timesteps = (
                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
        )
        alphas = timesteps / (1 + cosine_s) * np.pi / 2
        alphas = torch.cos(alphas).pow(2)
        alphas = alphas / alphas[0]
        betas = 1 - alphas[1:] / alphas[:-1]
        # Clamp with torch rather than np.clip: calling a NumPy function on a
        # tensor relies on implicit tensor<->array conversion whose result type
        # is version dependent, and the final `.numpy()` requires a tensor.
        betas = torch.clamp(betas, min=0, max=0.999)

    elif schedule == "squaredcos_cap_v2":  # used for karlo prior
        # return early: betas_for_alpha_bar already yields a NumPy array
        return betas_for_alpha_bar(
            n_timestep,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )

    elif schedule == "sqrt_linear":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
    elif schedule == "sqrt":
        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
    else:
        raise ValueError(f"schedule '{schedule}' unknown.")
    return betas.numpy()
51
+
52
+
53
def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
    """
    Pick the subset of DDPM timesteps visited by the DDIM sampler.
    :param ddim_discr_method: 'uniform' (constant stride) or 'quad'
                              (squares of a linspace).
    :param num_ddim_timesteps: requested number of DDIM steps.
    :param num_ddpm_timesteps: number of timesteps in the full schedule.
    :param verbose: if True, print the selected timesteps.
    :return: integer NumPy array of the selected timesteps, each shifted by +1.
    :raises NotImplementedError: for any other discretization method.
    """
    if ddim_discr_method == 'quad':
        upper = np.sqrt(num_ddpm_timesteps * .8)
        ddim_timesteps = (np.linspace(0, upper, num_ddim_timesteps) ** 2).astype(int)
    elif ddim_discr_method == 'uniform':
        stride = num_ddpm_timesteps // num_ddim_timesteps
        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, stride)))
    else:
        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')

    # The result length is not asserted to equal num_ddim_timesteps.
    # add one to get the final alpha values right (the ones from first scale to data during sampling)
    shifted = ddim_timesteps + 1
    if verbose:
        print(f'Selected timesteps for ddim sampler: {shifted}')
    return shifted
68
+
69
+
70
def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
    """
    Derive the per-step DDIM quantities from a cumulative-alpha table.
    :param alphacums: indexable table of cumulative alphas.
    :param ddim_timesteps: indices into `alphacums` for the selected steps.
    :param eta: scale applied to the sigma schedule.
    :param verbose: if True, print the selected alphas and resulting sigmas.
    :return: `(sigmas, alphas, alphas_prev)`.
    """
    # select alphas for computing the variance schedule
    alphas = alphacums[ddim_timesteps]
    # previous-step alphas: the very first table entry, then every selected alpha but the last
    prev_tail = alphacums[ddim_timesteps[:-1]].tolist()
    alphas_prev = np.asarray([alphacums[0]] + prev_tail)

    # according to the formula provided in https://arxiv.org/abs/2010.02502
    variance_term = (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)
    sigmas = eta * np.sqrt(variance_term)
    if verbose:
        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
        print(f'For the chosen value of eta, which is {eta}, '
              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
    return sigmas, alphas, alphas_prev
82
+
83
+
84
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha_bar(t) curve, t in [0, 1], into betas.
    Each beta is `1 - alpha_bar(t_next) / alpha_bar(t_cur)` for consecutive
    grid points, capped at `max_beta`.
    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a callable taking t from 0 to 1 and returning the
                      cumulative product of (1-beta) up to that part of the
                      diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    :return: NumPy array with `num_diffusion_timesteps` entries.
    """
    total = num_diffusion_timesteps
    return np.array([
        min(1 - alpha_bar((step + 1) / total) / alpha_bar(step / total), max_beta)
        for step in range(total)
    ])
101
+
102
+
103
def extract_into_tensor(a, t, x_shape):
    """
    Gather entries of `a` at indices `t` and reshape them for broadcasting.
    :param a: tensor to gather from (along its last dimension).
    :param t: index tensor; its first dimension is taken as the batch size.
    :param x_shape: shape of the tensor the result will be broadcast against;
                    only its length is used.
    :return: tensor of shape `(batch, 1, ..., 1)` with `len(x_shape)` dimensions.
    """
    batch, *_rest = t.shape
    gathered = a.gather(-1, t)
    singleton_dims = (1,) * (len(x_shape) - 1)
    return gathered.reshape(batch, *singleton_dims)
107
+
108
+
109
def checkpoint(func, inputs, params, flag):
    """
    Optionally evaluate `func` through `CheckpointFunction`, which re-runs the
    function in the backward pass instead of caching intermediate activations.
    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing and call `func` directly.
    :return: whatever `func(*inputs)` returns.
    """
    if not flag:
        return func(*inputs)
    # Inputs come first, then the implicit parameters; the input count tells
    # CheckpointFunction where to split them again.
    flat_args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *flat_args)
124
+
125
+
126
class CheckpointFunction(torch.autograd.Function):
    """
    Autograd function that runs `run_function` under `no_grad` in the forward
    pass and re-runs it with gradients enabled in the backward pass.
    """

    @staticmethod
    def forward(ctx, run_function, length, *args):
        """
        :param run_function: callable applied to the first `length` entries of `args`.
        :param length: how many leading entries of `args` are explicit inputs.
        :param args: explicit inputs followed by the extra parameters.
        :return: the output of `run_function`, computed without recording a graph.
        """
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        # Snapshot the autocast state so the recomputation in backward() can
        # run under the same settings.
        ctx.gpu_autocast_kwargs = dict(
            enabled=torch.is_autocast_enabled(),
            dtype=torch.get_autocast_gpu_dtype(),
            cache_enabled=torch.is_autocast_cache_enabled(),
        )
        with torch.no_grad():
            result = ctx.run_function(*ctx.input_tensors)
        return result

    @staticmethod
    def backward(ctx, *output_grads):
        """
        Recompute the forward pass with gradients enabled and differentiate it.
        :return: `(None, None)` for `run_function` and `length`, followed by the
                 gradients for the inputs and then for the extra parameters.
        """
        ctx.input_tensors = [inp.detach().requires_grad_(True) for inp in ctx.input_tensors]
        with torch.enable_grad():
            with torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
                # Fixes a bug where the first op in run_function modifies the
                # Tensor storage in place, which is not allowed for detach()'d
                # Tensors.
                views = [inp.view_as(inp) for inp in ctx.input_tensors]
                recomputed = ctx.run_function(*views)
        grads = torch.autograd.grad(
            recomputed,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        # Drop references held on ctx once the gradients are computed.
        del ctx.input_tensors
        del ctx.input_params
        del recomputed
        return (None, None) + grads
159
+
160
+
161
def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoids and tile the raw timesteps
                        across `dim` columns instead.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    if repeat_only:
        return repeat(timesteps, 'b -> b d', d=dim)

    half = dim // 2
    exponents = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
    freqs = torch.exp(exponents).to(device=timesteps.device)
    angles = timesteps[:, None].float() * freqs[None]
    # cosines occupy the first `half` columns, sines the next `half`
    embedding = torch.cat([torch.cos(angles), torch.sin(angles)], dim=-1)
    if dim % 2:
        # odd `dim`: append a single zero column to reach the requested width
        zero_column = torch.zeros_like(embedding[:, :1])
        embedding = torch.cat([embedding, zero_column], dim=-1)
    return embedding
182
+
183
+
184
def zero_module(module):
    """
    Set every parameter of `module` to zero in place.
    :param module: the module whose parameters are zeroed.
    :return: the same module object.
    """
    for param in module.parameters():
        # operate on a detached view so the in-place write is not tracked by autograd
        param.detach().zero_()
    return module
191
+
192
+
193
def scale_module(module, scale):
    """
    Multiply every parameter of `module` by `scale` in place.
    :param module: the module whose parameters are scaled.
    :param scale: the multiplier.
    :return: the same module object.
    """
    for param in module.parameters():
        # operate on a detached view so the in-place write is not tracked by autograd
        param.detach().mul_(scale)
    return module
200
+
201
+
202
def mean_flat(tensor):
    """
    Average over every dimension except the first (batch) one.
    :param tensor: the input tensor.
    :return: the result of `tensor.mean` over dimensions 1..last.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
207
+
208
+
209
def normalization(channels):
    """
    Make the standard normalization layer: a `GroupNorm32` with 32 groups.
    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    num_groups = 32
    return GroupNorm32(num_groups, channels)
216
+
217
+
218
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
219
class SiLU(nn.Module):
    """Module computing `x * sigmoid(x)` elementwise."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
222
+
223
+
224
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that normalizes a float32 copy of the input and casts back."""

    def forward(self, x):
        original_dtype = x.dtype
        normalized = super().forward(x.float())
        # hand the result back in the caller's dtype
        return normalized.type(original_dtype)
227
+
228
+
229
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    :param dims: 1, 2 or 3, selecting nn.Conv1d / nn.Conv2d / nn.Conv3d.
    :param args: positional arguments forwarded to the chosen constructor.
    :param kwargs: keyword arguments forwarded to the chosen constructor.
    :return: the constructed convolution module.
    :raises ValueError: if `dims` is not 1, 2 or 3.
    """
    candidates = ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d))
    for rank, conv_cls in candidates:
        if dims == rank:
            return conv_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
240
+
241
+
242
def linear(*args, **kwargs):
    """
    Create a linear module.
    :param args: positional arguments forwarded to nn.Linear.
    :param kwargs: keyword arguments forwarded to nn.Linear.
    :return: the constructed nn.Linear.
    """
    layer = nn.Linear(*args, **kwargs)
    return layer
247
+
248
+
249
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    :param dims: 1, 2 or 3, selecting nn.AvgPool1d / nn.AvgPool2d / nn.AvgPool3d.
    :param args: positional arguments forwarded to the chosen constructor.
    :param kwargs: keyword arguments forwarded to the chosen constructor.
    :return: the constructed pooling module.
    :raises ValueError: if `dims` is not 1, 2 or 3.
    """
    candidates = ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d))
    for rank, pool_cls in candidates:
        if dims == rank:
            return pool_cls(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")
260
+
261
+
262
class HybridConditioner(nn.Module):
    """Holds two conditioners, one per conditioning input, and applies each to its input."""

    def __init__(self, c_concat_config, c_crossattn_config):
        """
        :param c_concat_config: config passed to `instantiate_from_config` for the concat conditioner.
        :param c_crossattn_config: config passed to `instantiate_from_config` for the cross-attention conditioner.
        """
        super().__init__()
        self.concat_conditioner = instantiate_from_config(c_concat_config)
        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)

    def forward(self, c_concat, c_crossattn):
        """
        Run each conditioner on its own input.
        :return: dict with keys 'c_concat' and 'c_crossattn', each mapping to a
                 one-element list holding the corresponding conditioner output.
        """
        concat_out = self.concat_conditioner(c_concat)
        crossattn_out = self.crossattn_conditioner(c_crossattn)
        return dict(c_concat=[concat_out], c_crossattn=[crossattn_out])
273
+
274
+
275
def noise_like(shape, device, repeat=False):
    """
    Draw standard-normal noise of the given shape.
    :param shape: the output shape.
    :param device: the device on which the noise is created.
    :param repeat: if True, draw a single sample of shape `(1, *shape[1:])`
                   and tile it `shape[0]` times along the first dimension, so
                   every batch element gets identical noise.
    :return: a Tensor of shape `shape`.
    """
    if repeat:
        single = torch.randn((1, *shape[1:]), device=device)
        tiling = (shape[0], *((1,) * (len(shape) - 1)))
        return single.repeat(*tiling)
    return torch.randn(shape, device=device)
ldm/modules/distributions/__init__.py ADDED
File without changes
ldm/modules/distributions/distributions.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+
4
+
5
class AbstractDistribution:
    """Interface for distributions; both methods raise until overridden."""

    def sample(self):
        """Draw a sample. Not implemented here."""
        raise NotImplementedError()

    def mode(self):
        """Return the mode. Not implemented here."""
        raise NotImplementedError()
11
+
12
+
13
class DiracDistribution(AbstractDistribution):
    """Distribution concentrated on one stored value."""

    def __init__(self, value):
        self.value = value

    def sample(self):
        """Return the stored value; no randomness is involved."""
        return self.value

    def mode(self):
        """Return the stored value."""
        return self.value
22
+
23
+
24
class DiagonalGaussianDistribution(object):
    """
    Gaussian with diagonal covariance, built from a tensor that stacks the
    mean and the log-variance along dimension 1.
    """

    def __init__(self, parameters, deterministic=False):
        """
        :param parameters: tensor split into two equal chunks along dim 1:
                           the mean first, then the log-variance.
        :param deterministic: if True, variance and std are forced to zero.
        """
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        # keep exp() of the log-variance within a finite range
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def _zero(self):
        # One-element zero placed on the parameters' device, so callers can
        # combine it with other tensors living on that device.
        return torch.zeros(1, device=self.parameters.device)

    def sample(self):
        """
        Draw `mean + std * eps` with standard-normal `eps`.
        :return: a tensor with the shape of the mean.
        """
        # The noise is deliberately still drawn with the default generator and
        # then moved, as before, so seeded results are unchanged.
        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
        return x

    def kl(self, other=None):
        """
        KL divergence from this distribution to `other`, summed over dims 1-3.
        :param other: another DiagonalGaussianDistribution; when None, the
                      formula for a zero-mean, unit-variance reference is used.
        :return: per-batch-element KL, or a one-element zero tensor in
                 deterministic mode.
        """
        if self.deterministic:
            return self._zero()
        if other is None:
            return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                   + self.var - 1.0 - self.logvar,
                                   dim=[1, 2, 3])
        return 0.5 * torch.sum(
            torch.pow(self.mean - other.mean, 2) / other.var
            + self.var / other.var - 1.0 - self.logvar + other.logvar,
            dim=[1, 2, 3])

    def nll(self, sample, dims=(1, 2, 3)):
        """
        Negative log-likelihood of `sample`, summed over `dims`.
        :param sample: tensor to evaluate.
        :param dims: dimensions to sum over (any sequence of ints).
        :return: the summed NLL, or a one-element zero tensor in
                 deterministic mode.
        """
        if self.deterministic:
            return self._zero()
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        """Return the mean, which is the mode of a Gaussian."""
        return self.mean
63
+
64
+
65
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
    Compute the KL divergence between two gaussians, each given by a mean and
    a log-variance.
    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases. At least one argument must be a Tensor.
    """
    candidates = (mean1, logvar1, mean2, logvar2)
    reference = next((arg for arg in candidates if isinstance(arg, torch.Tensor)), None)
    assert reference is not None, "at least one argument must be a Tensor"

    def _as_tensor(value):
        # Force variances to be Tensors. Broadcasting helps convert scalars to
        # Tensors, but it does not work for torch.exp().
        if isinstance(value, torch.Tensor):
            return value
        return torch.tensor(value).to(reference)

    logvar1 = _as_tensor(logvar1)
    logvar2 = _as_tensor(logvar2)

    squared_gap = (mean1 - mean2) ** 2
    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + torch.exp(logvar1 - logvar2)
        + squared_gap * torch.exp(-logvar2)
    )