paracanthurus amankishore commited on
Commit
f327edf
0 Parent(s):

Duplicate from MirageML/sjc

Browse files

Co-authored-by: Aman Kishore <amankishore@users.noreply.huggingface.co>

This view is limited to 50 files because it contains too many changes.   See raw diff
.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ release/diffusion_ckpts/guided_ddpm/models/lsun_bedroom.pt filter=lfs diff=lfs merge=lfs -text
36
+ release/diffusion_ckpts/guided_ddpm/models/lsun_ffhq.pt filter=lfs diff=lfs merge=lfs -text
37
+ release/diffusion_ckpts/stable_diffusion/sd-v1-5.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.png
2
+
3
+ # sd1/
4
+ # sd2/
5
+
6
+ sde/
7
+
8
+ notebooks/
9
+ out/
10
+ slurm_outputs/
11
+
12
+ FID/torch_utils/
13
+ FID/dnnlib/
14
+
15
+ # Byte-compiled / optimized / DLL files
16
+ __pycache__/
17
+ *.py[cod]
18
+ *$py.class
19
+
20
+ # C extensions
21
+ *.so
22
+
23
+ # Distribution / packaging
24
+ .Python
25
+ build/
26
+ develop-eggs/
27
+ dist/
28
+ downloads/
29
+ eggs/
30
+ .eggs/
31
+ lib/
32
+ lib64/
33
+ parts/
34
+ sdist/
35
+ var/
36
+ wheels/
37
+ pip-wheel-metadata/
38
+ share/python-wheels/
39
+ *.egg-info/
40
+ .installed.cfg
41
+ *.egg
42
+ MANIFEST
43
+
44
+ # PyInstaller
45
+ # Usually these files are written by a python script from a template
46
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
47
+ *.manifest
48
+ *.spec
49
+
50
+ # Installer logs
51
+ pip-log.txt
52
+ pip-delete-this-directory.txt
53
+
54
+ # Unit test / coverage reports
55
+ htmlcov/
56
+ .tox/
57
+ .nox/
58
+ .coverage
59
+ .coverage.*
60
+ .cache
61
+ nosetests.xml
62
+ coverage.xml
63
+ *.cover
64
+ *.py,cover
65
+ .hypothesis/
66
+ .pytest_cache/
67
+
68
+ # Translations
69
+ *.mo
70
+ *.pot
71
+
72
+ # Django stuff:
73
+ *.log
74
+ local_settings.py
75
+ db.sqlite3
76
+ db.sqlite3-journal
77
+
78
+ # Flask stuff:
79
+ instance/
80
+ .webassets-cache
81
+
82
+ # Scrapy stuff:
83
+ .scrapy
84
+
85
+ # Sphinx documentation
86
+ docs/_build/
87
+
88
+ # PyBuilder
89
+ target/
90
+
91
+ # Jupyter Notebook
92
+ .ipynb_checkpoints
93
+
94
+ # IPython
95
+ profile_default/
96
+ ipython_config.py
97
+
98
+ # pyenv
99
+ .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
109
+ __pypackages__/
110
+
111
+ # Celery stuff
112
+ celerybeat-schedule
113
+ celerybeat.pid
114
+
115
+ # SageMath parsed files
116
+ *.sage.py
117
+
118
+ # Environments
119
+ .env
120
+ .venv
121
+ env/
122
+ venv/
123
+ ENV/
124
+ env.bak/
125
+ venv.bak/
126
+
127
+ # Spyder project settings
128
+ .spyderproject
129
+ .spyproject
130
+
131
+ # Rope project settings
132
+ .ropeproject
133
+
134
+ # mkdocs documentation
135
+ /site
136
+
137
+ # mypy
138
+ .mypy_cache/
139
+ .dmypy.json
140
+ dmypy.json
141
+
142
+ # Pyre type checker
143
+ .pyre/
144
+
145
+ ckpt/
146
+ depth/
147
+ img/
148
+ test*/
149
+ view/
150
+ vis/
LICENSE ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2022 Score Jacobian Chaining authors
2
+
3
+ CreativeML Open RAIL-M
4
+ dated August 22, 2022
5
+
6
+ Section I: PREAMBLE
7
+
8
+ Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
9
+
10
+ Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
11
+
12
+ In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
13
+
14
+ Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
15
+
16
+ This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
17
+
18
+ NOW THEREFORE, You and Licensor agree as follows:
19
+
20
+ 1. Definitions
21
+
22
+ - "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
23
+ - "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
24
+ - "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
25
+ - "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
26
+ - "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
27
+ - "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
28
+ - "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
29
+ - "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
30
+ - "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
31
+ - "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
32
+ - "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
33
+ - "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
34
+
35
+ Section II: INTELLECTUAL PROPERTY RIGHTS
36
+
37
+ Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
38
+
39
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
40
+ 3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
41
+
42
+ Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
43
+
44
+ 4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
45
+ Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
46
+ You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
47
+ You must cause any modified files to carry prominent notices stating that You changed the files;
48
+ You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
49
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
50
+ 5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
51
+ 6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
52
+
53
+ Section IV: OTHER PROVISIONS
54
+
55
+ 7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
56
+ 8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
57
+ 9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
58
+ 10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
59
+ 11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
60
+ 12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
61
+
62
+ END OF TERMS AND CONDITIONS
63
+
64
+
65
+
66
+
67
+ Attachment A
68
+
69
+ Use Restrictions
70
+
71
+ You agree not to use the Model or Derivatives of the Model:
72
+ - In any way that violates any applicable national, federal, state, local or international law or regulation;
73
+ - For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
74
+ - To generate or disseminate verifiably false information and/or content with the purpose of harming others;
75
+ - To generate or disseminate personal identifiable information that can be used to harm an individual;
76
+ - To defame, disparage or otherwise harass others;
77
+ - For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
78
+ - For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
79
+ - To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
80
+ - For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
81
+ - To provide medical advice and medical results interpretation;
82
+ - To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
README-orig.md ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation
2
+
3
+ [Haochen Wang*](https://whc.is/),
4
+ [Xiaodan Du*](https://github.com/duxiaodan),
5
+ [Jiahao Li*](https://www.linkedin.com/in/jiahaoli95/),
6
+ [Raymond A. Yeh&dagger;](https://raymond-yeh.com),
7
+ [Greg Shakhnarovich](https://home.ttic.edu/~gregory/)
8
+ (* indicates equal contribution)
9
+
10
+ TTI-Chicago, &dagger;Purdue University
11
+
12
+ Abstract: *A diffusion model learns to predict a vector field of gradients. We propose to apply chain rule on the learned gradients, and back-propagate the score of a diffusion model through the Jacobian of a differentiable renderer, which we instantiate to be a voxel radiance field. This setup aggregates 2D scores at multiple camera viewpoints into a 3D score, and repurposes a pretrained 2D model for 3D data generation. We identify a technical challenge of distribution mismatch that arises in this application, and propose a novel estimation mechanism to resolve it. We run our algorithm on several off-the-shelf diffusion image generative models, including the recently released Stable Diffusion trained on the large-scale LAION dataset.*
13
+
14
+
15
+ <a href="https://arxiv.org/abs/2212.00774"><img src="https://img.shields.io/badge/arXiv-2212.00774-b31b1b.svg" height=22.5></a>
16
+ <a href="https://colab.research.google.com/drive/1zixo66UYGl70VOPy053o7IV_YkQt5lCZ?usp=sharing"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=22.5></a>
17
+ <a href="https://pals.ttic.edu/p/score-jacobian-chaining"><img src="https://img.shields.io/website?down_color=lightgrey&down_message=offline&label=Project%20Page&up_color=lightgreen&up_message=online&url=https%3A%2F%2Fpals.ttic.edu%2Fp%2Fscore-jacobian-chaining" height=22.5></a>
18
+
19
+ <!-- [ [arxiv](https://arxiv.org/abs/2212.00774) | [project page](https://pals.ttic.edu/p/score-jacobian-chaining) | [colab](https://colab.research.google.com/drive/1zixo66UYGl70VOPy053o7IV_YkQt5lCZ?usp=sharing ) ] -->
20
+
21
+ Many thanks to [dvschultz](https://github.com/dvschultz) for the colab.
22
+
23
+ ## Updates
24
+ - We have added subpixel rendering script for final high quality vis. The jittery videos you might have seen should be significantly better now. Please run `python /path/to/sjc/highres_final_vis.py` in the exp folder after the training is complete. There are a few toggles in the script you can play with, but the default is ok. It takes about 5 minutes / 11GB on an A5000, and the extra time is mainly due to SD Decoder.
25
+ - If you are running SJC with a DreamBooth fine-tuned model: the model's output distribution is already significantly narrowed. It might help to use a lower guidance scale `--sd.scale 50.0` for example. Intense mode-seeking is one cause for multi-face problem. We have internally tried DreamBooth with view-dependent prompt fine-tuning. But by and large DreamBooth integration is not ready.
26
+
27
+
28
+ ## TODOs
29
+ - [ ] make seeds configurable. So far all seeds are hardcoded to 0.
30
+ - [ ] add script to reproduce 2D experiments in Fig 4. The Fig might need change once it's tied to seeds. Note that for a simple aligned domain like faces, simple scheduling like using a single σ=1.5 could already generate some nice images. But not so for bedrooms; it's too diverse and annealing seems still needed.
31
+ - [ ] main paper figures did not use subpix rendering; appendix figures did. Replace the main paper figures to make them consistent.
32
+
33
+ ## License
34
+ Since we use Stable Diffusion, we are releasing under their OpenRAIL license. Otherwise we do not
35
+ identify any components or upstream code that carry restrictive licensing requirements.
36
+
37
+ ## Structure
38
+ In addition to SJC, the repo also contains an implementation of [Karras sampler](https://arxiv.org/abs/2206.00364),
39
+ and a customized, simple voxel NeRF. We provide the abstract parent class based on Karras et al. and include
40
+ a few types of diffusion model here. See adapt.py.
41
+
42
+ ## Installation
43
+
44
+ Install Pytorch according to your CUDA version, for example:
45
+ ```bash
46
+ pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
47
+ ```
48
+
49
+ Install other dependencies by `pip install -r requirements.txt`.
50
+
51
+ Install `taming-transformers` manually
52
+ ```bash
53
+ git clone --depth 1 git@github.com:CompVis/taming-transformers.git && pip install -e taming-transformers
54
+ ```
55
+
56
+ ## Downloading checkpoints
57
+ We have bundled a minimal set of things you need to download (SD v1.5 ckpt, gddpm ckpt for LSUN and FFHQ)
58
+ in a tar file, made available at our download server [here](https://dl.ttic.edu/pals/sjc/release.tar).
59
+ It is a single file of 12GB, and you can use wget or curl.
60
+
61
+ Remember to __update__ `env.json` to point at the new checkpoint root where you have uncompressed the files.
62
+
63
+ ## Usage
64
+ Make a new directory to run experiments (the script generates many logging files. Do not run at the root of the code repo, else risk contamination.)
65
+ ```bash
66
+ mkdir exp
67
+ cd exp
68
+ ```
69
+ Run the following command to generate a new 3D asset. It takes about 25 minutes / 10GB GPU mem on a single A5000 GPU for 10000 steps of optimization.
70
+ ```bash
71
+ python /path/to/sjc/run_sjc.py \
72
+ --sd.prompt "A zoomed out high quality photo of Temple of Heaven" \
73
+ --n_steps 10000 \
74
+ --lr 0.05 \
75
+ --sd.scale 100.0 \
76
+ --emptiness_weight 10000 \
77
+ --emptiness_step 0.5 \
78
+ --emptiness_multiplier 20.0 \
79
+ --depth_weight 0 \
80
+ --var_red False
81
+ ```
82
+ `sd.prompt` is the prompt to the stable diffusion model
83
+
84
+ `n_steps` is the number of gradient steps
85
+
86
+ `lr` is the base learning rate of the optimizer
87
+
88
+ `sd.scale` is the guidance scale for stable diffusion
89
+
90
+ `emptiness_weight` is the weighting factor of the emptiness loss
91
+
92
+ `emptiness_step` indicates after `emptiness_step * n_steps` update steps, the `emptiness_weight` is multiplied by `emptiness_multiplier`.
93
+
94
+ `emptiness_multiplier` is the factor by which `emptiness_weight` is multiplied (see `emptiness_step` above)
95
+
96
+ `depth_weight` the weighting factor of the center depth loss
97
+
98
+ `var_red` whether to use Eq. 16 vs Eq. 15. For some prompts such as Obama we actually see better results with Eq. 15.
99
+
100
+ Visualization results are stored in the current directory. In directories named `test_*` there are images (under `view`) and videos (under `view_seq`) rendered at different iterations.
101
+
102
+
103
+ ## To Reproduce the Results in the Paper
104
+ First create a clean directory for your experiment, then run one of the following scripts from that folder:
105
+ ### Trump
106
+ ```
107
+ python /path/to/sjc/run_sjc.py --sd.prompt "Trump figure" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
108
+ ```
109
+ ### Obama
110
+ ```
111
+ python /path/to/sjc/run_sjc.py --sd.prompt "Obama figure" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
112
+ ```
113
+ ### Biden
114
+ ```
115
+ python /path/to/sjc/run_sjc.py --sd.prompt "Biden figure" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
116
+ ```
117
+ ### Temple of Heaven
118
+ ```
119
+ python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out high quality photo of Temple of Heaven" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
120
+ ```
121
+ ### Burger
122
+ ```
123
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a delicious burger" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
124
+ ```
125
+ ### Icecream
126
+ ```
127
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a chocolate icecream cone" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
128
+
129
+ ```
130
+ ### Ficus
131
+ ```
132
+ python /path/to/sjc/run_sjc.py --sd.prompt "A ficus planted in a pot" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 100
133
+ ```
134
+ ### Castle
135
+ ```
136
+ python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out photo a small castle" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
137
+ ```
138
+ ### Sydney Opera House
139
+ ```
140
+ python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out high quality photo of Sydney Opera House" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
141
+ ```
142
+ ### Rose
143
+ ```
144
+ python /path/to/sjc/run_sjc.py --sd.prompt "a DSLR photo of a rose" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
145
+ ```
146
+ ### School Bus
147
+ ```
148
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a yellow school bus" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
149
+ ```
150
+ ### Rocket
151
+ ```
152
+ python /path/to/sjc/run_sjc.py --sd.prompt "A wide angle zoomed out photo of Saturn V rocket from distance" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
153
+ ```
154
+ ### French Fries
155
+ ```
156
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of french fries from McDonald's" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
157
+ ```
158
+ ### Motorcycle
159
+ ```
160
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a toy motorcycle" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
161
+ ```
162
+ ### Car
163
+ ```
164
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a classic silver muscle car" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
165
+ ```
166
+ ### Tank
167
+ ```
168
+ python /path/to/sjc/run_sjc.py --sd.prompt "A product photo of a toy tank" --n_steps 20000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
169
+ ```
170
+ ### Chair
171
+ ```
172
+ python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a Victorian style wooden chair with velvet upholstery" --n_steps 50000 --lr 0.01 --sd.scale 100.0 --emptiness_weight 7000
173
+ ```
174
+ ### Duck
175
+ ```
176
+ python /path/to/sjc/run_sjc.py --sd.prompt "a DSLR photo of a yellow duck" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
177
+ ```
178
+ ### Horse
179
+ ```
180
+ python /path/to/sjc/run_sjc.py --sd.prompt "A photo of a horse walking" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
181
+ ```
182
+ ### Giraffe
183
+ ```
184
+ python /path/to/sjc/run_sjc.py --sd.prompt "A wide angle zoomed out photo of a giraffe" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
185
+ ```
186
+ ### Zebra
187
+ ```
188
+ python /path/to/sjc/run_sjc.py --sd.prompt "A photo of a zebra walking" --n_steps 10000 --lr 0.02 --sd.scale 100.0 --emptiness_weight 30000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
189
+ ```
190
+ ### Printer
191
+ ```
192
+ python /path/to/sjc/run_sjc.py --sd.prompt "A product photo of a Canon home printer" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
193
+ ```
194
+ ### Zelda Link
195
+ ```
196
+ python /path/to/sjc/run_sjc.py --sd.prompt "Zelda Link" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
197
+ ```
198
+ ### Pig
199
+ ```
200
+ python /path/to/sjc/run_sjc.py --sd.prompt "A pig" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
201
+ ```
202
+
203
+
204
+ ## To Test the Voxel NeRF
205
+ ```
206
+ python /path/to/sjc/run_nerf.py
207
+ ```
208
+ Our bundle contains a tar ball for the lego bulldozer dataset. Untar it and it will work.
209
+
210
+ ## To Sample 2D images with the Karras Sampler
211
+ ```
212
+ python /path/to/sjc/run_img_sampling.py
213
+ ```
214
+ Use the `-h` flag to see the available options. The details will be expanded later.
215
+
216
+
217
+ ## Bib
218
+ ```
219
+ @article{sjc,
220
+ title={Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation},
221
+ author={Wang, Haochen and Du, Xiaodan and Li, Jiahao and Yeh, Raymond A. and Shakhnarovich, Greg},
222
+ journal={arXiv preprint arXiv:2212.00774},
223
+ year={2022},
224
+ }
225
+ ```
README.md ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Score Jacobian Chaining
3
+ emoji: 🧊
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.12.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: creativeml-openrail-m
11
+ duplicated_from: MirageML/sjc
12
+ ---
13
+
14
+ ## Bib
15
+ ```
16
+ @article{sjc,
17
+ title={Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation},
18
+ author={Wang, Haochen and Du, Xiaodan and Li, Jiahao and Yeh, Raymond A. and Shakhnarovich, Greg},
19
+ journal={arXiv preprint arXiv:2212.00774},
20
+ year={2022},
21
+ }
22
+ ```
adapt.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import json
3
+ from math import sqrt
4
+ import numpy as np
5
+ import torch
6
+ from abc import ABCMeta, abstractmethod
7
+
8
+
9
class ScoreAdapter(metaclass=ABCMeta):
    """Uniform interface over pretrained diffusion denoisers.

    Concrete subclasses (guided DDPM, NCSN, Stable Diffusion, VE-SDE) adapt
    their native parameterization to a common denoise/score API indexed by a
    continuous noise level σ, so one sampler (see `Karras`) can drive them all.
    """

    @abstractmethod
    def denoise(self, xs, σ, **kwargs):
        """Return the denoised estimate D(xs; σ) at noise level σ."""
        pass

    def score(self, xs, σ, **kwargs):
        """Score ∇_x log p_σ(x), derived from the denoiser: (D(x) - x) / σ²."""
        Ds = self.denoise(xs, σ, **kwargs)
        grad_log_p_t = (Ds - xs) / (σ ** 2)
        return grad_log_p_t

    @abstractmethod
    def data_shape(self):
        """Shape of a single sample (no batch dim), e.g. (C, H, W)."""
        return (3, 256, 256) # for example

    def samps_centered(self):
        # if centered, samples expected to be in range [-1, 1], else [0, 1]
        return True

    @property
    @abstractmethod
    def σ_max(self):
        """Largest noise level the model was trained on."""
        pass

    @property
    @abstractmethod
    def σ_min(self):
        """Smallest noise level the model was trained on."""
        pass

    def cond_info(self, batch_size):
        """Conditioning kwargs for a batch (e.g. class ids, prompt embeddings)."""
        return {}

    @abstractmethod
    def unet_is_cond(self):
        """Whether the denoising network itself takes conditioning inputs."""
        return False

    @abstractmethod
    def use_cls_guidance(self):
        """Whether sampling should add a classifier-guidance gradient."""
        return False  # most models do not use cls guidance

    def classifier_grad(self, xs, σ, ys):
        """∇_x log p(y|x, σ); only meaningful when use_cls_guidance() is True."""
        raise NotImplementedError()

    @abstractmethod
    def snap_t_to_nearest_tick(self, t):
        """Snap a continuous σ to the model's trained discrete noise levels.

        Returns (snapped_σ, tick_index); continuous-time models may return
        (t, None) unchanged.
        """
        # need to confirm for each model; continuous time model doesn't need this
        return t, None

    @property
    def device(self):
        # subclasses must set self._device in __init__
        return self._device

    def checkpoint_root(self):
        """the path at which the pretrained checkpoints are stored"""
        # reads env.json placed next to this source file; expects a
        # {"data_root": ...} entry pointing at the checkpoint store
        with Path(__file__).resolve().with_name("env.json").open("r") as f:
            root = json.load(f)['data_root']
        root = Path(root) / "diffusion_ckpts"
        return root
67
+
68
+
69
def karras_t_schedule(ρ=7, N=10, σ_max=80, σ_min=0.002):
    """Return the N-point Karras noise schedule from σ_max down to σ_min.

    The schedule interpolates linearly in σ^(1/ρ) space and raises back to
    the ρ-th power, giving the front-loaded spacing of Karras et al.
    """
    upper = σ_max ** (1 / ρ)
    lower = σ_min ** (1 / ρ)
    return [
        (upper + (i / (N - 1)) * (lower - upper)) ** ρ
        for i in range(N)
    ]
78
+
79
+
80
def power_schedule(σ_max, σ_min, num_stages):
    """Geometric (log-linear) schedule of num_stages σ values, σ_max → σ_min."""
    log_σs = np.linspace(np.log(σ_max), np.log(σ_min), num_stages)
    return np.exp(log_σs)
83
+
84
+
85
class Karras():
    """Karras et al. (2022)-style sampler driven by any ScoreAdapter.

    `inference` is a generator: it yields the initial noise and then the
    iterate after each solver step so callers can monitor progress.
    """

    @classmethod
    @torch.no_grad()
    def inference(
        cls, model, batch_size, num_t, *,
        σ_max=80, cls_scaling=1,
        init_xs=None, heun=True,
        langevin=False,
        S_churn=80, S_min=0.05, S_max=50, S_noise=1.003,
    ):
        """Sample `batch_size` images in `num_t` solver steps.

        model: a ScoreAdapter providing score/denoise and the trained σ range.
        heun: apply Heun's 2nd-order correction (skipped on the final step).
        langevin: inject "churn" noise when S_min < t < S_max.
        cls_scaling: multiplier on the classifier-guidance gradient.
        init_xs: optional starting point; otherwise pure Gaussian noise at σ_max.
        """
        σ_max = min(σ_max, model.σ_max)
        σ_min = model.σ_min
        ts = karras_t_schedule(ρ=7, N=num_t, σ_max=σ_max, σ_min=σ_min)
        assert len(ts) == num_t
        # discrete-time models only know their trained noise levels; snap to them
        ts = [model.snap_t_to_nearest_tick(t)[0] for t in ts]
        ts.append(0)  # 0 is the destination
        σ_max = ts[0]

        cond_inputs = model.cond_info(batch_size)

        def compute_step(xs, σ):
            # ODE drift d_i = -σ · ∇_x log p_σ(x), optionally with classifier guidance
            grad_log_p_t = model.score(
                xs, σ, **(cond_inputs if model.unet_is_cond() else {})
            )
            if model.use_cls_guidance():
                grad_cls = model.classifier_grad(xs, σ, cond_inputs["y"])
                grad_cls = grad_cls * cls_scaling
                grad_log_p_t += grad_cls
            d_i = -1 * σ * grad_log_p_t
            return d_i

        if init_xs is not None:
            xs = init_xs.to(model.device)
        else:
            # start from pure noise scaled to the largest σ
            xs = σ_max * torch.randn(
                batch_size, *model.data_shape(), device=model.device
            )

        yield xs

        for i in range(num_t):
            t_i = ts[i]

            if langevin and (S_min < t_i and t_i < S_max):
                # stochastic churn: temporarily raise the noise level
                xs, t_i = cls.noise_backward_in_time(
                    model, xs, t_i, S_noise, S_churn / num_t
                )

            Δt = ts[i+1] - t_i

            d_1 = compute_step(xs, σ=t_i)
            xs_1 = xs + Δt * d_1

            # Heun's 2nd order method; don't apply on the last step
            if (not heun) or (ts[i+1] == 0):
                xs = xs_1
            else:
                d_2 = compute_step(xs_1, σ=ts[i+1])
                xs = xs + Δt * (d_1 + d_2) / 2

            yield xs

    @staticmethod
    def noise_backward_in_time(model, xs, t_i, S_noise, S_churn_i):
        """Add fresh noise to move xs from level t_i up to a higher t_i_hat."""
        n = S_noise * torch.randn_like(xs)
        γ_i = min(sqrt(2)-1, S_churn_i)
        t_i_hat = t_i * (1 + γ_i)
        t_i_hat = model.snap_t_to_nearest_tick(t_i_hat)[0]
        # variance adds in quadrature: new σ² = old σ² + injected σ²
        xs = xs + n * sqrt(t_i_hat ** 2 - t_i ** 2)
        return xs, t_i_hat
156
+
157
+
158
def test():
    """Placeholder smoke-test entry point; does nothing yet."""
    pass


if __name__ == "__main__":
    test()
adapt_gddpm.py ADDED
@@ -0,0 +1,562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from math import sin, pi, sqrt
3
+ from functools import partial
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from easydict import EasyDict
11
+ from guided_diffusion.script_util import (
12
+ create_model_and_diffusion,
13
+ model_and_diffusion_defaults,
14
+
15
+ NUM_CLASSES,
16
+ create_classifier,
17
+ classifier_defaults,
18
+
19
+ sr_create_model_and_diffusion,
20
+ sr_model_and_diffusion_defaults,
21
+ )
22
+
23
+ from adapt import ScoreAdapter
24
+
25
+ from my.registry import Registry
26
+
27
+ PRETRAINED_REGISTRY = Registry("pretrained")
28
+
29
+
30
+ device = torch.device("cuda")
31
+
32
+
33
def load_ckpt(path, **kwargs):
    """Deserialize a checkpoint from *path* with torch.load.

    Extra kwargs (e.g. map_location) are forwarded verbatim.
    """
    return torch.load(path, **kwargs)
37
+
38
+
39
def pick_out_cfgs(src, target_ks):
    """Subset of *src* restricted to keys in *target_ks* (KeyError if missing)."""
    subset = {}
    for key in target_ks:
        subset[key] = src[key]
    return subset
41
+
42
+
43
@PRETRAINED_REGISTRY.register()
def m_imgnet_64():
    """Guided-diffusion config: 64x64 class-conditional ImageNet model + classifier."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=True,
        diffusion_steps=1000,
        dropout=0.1,
        image_size=64,
        learn_sigma=True,
        noise_schedule="cosine",
        num_channels=192,
        num_head_channels=64,
        num_res_blocks=3,
        resblock_updown=True,
        use_new_attention_order=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        # classifier for guidance
        classifier_depth=4,

        classifier_scale=1.0,
        model_path="models/64x64_diffusion.pt",
        classifier_path="models/64x64_classifier.pt",
    )
67
+
68
+
69
@PRETRAINED_REGISTRY.register()
def m_imgnet_128():
    """Guided-diffusion config: 128x128 class-conditional ImageNet model + classifier."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=True,
        diffusion_steps=1000,
        image_size=128,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=256,
        num_heads=4,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        classifier_scale=0.5,
        model_path="models/128x128_diffusion.pt",
        classifier_path="models/128x128_classifier.pt",
    )
89
+
90
+
91
@PRETRAINED_REGISTRY.register()
def m_imgnet_256():
    """Guided-diffusion config: 256x256 class-conditional ImageNet model + classifier."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=True,
        diffusion_steps=1000,
        image_size=256,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=256,
        num_head_channels=64,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        classifier_scale=1.0,
        model_path="models/256x256_diffusion.pt",
        classifier_path="models/256x256_classifier.pt"
    )
111
+
112
+
113
@PRETRAINED_REGISTRY.register()
def m_imgnet_256_uncond():
    """Guided-diffusion config: 256x256 unconditional model, classifier-guided."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=False,
        diffusion_steps=1000,
        image_size=256,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=256,
        num_head_channels=64,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        # stronger guidance to compensate for the unconditional unet
        classifier_scale=10.0,
        model_path="models/256x256_diffusion_uncond.pt",
        classifier_path="models/256x256_classifier.pt",
    )
133
+
134
+
135
@PRETRAINED_REGISTRY.register()
def m_imgnet_512():
    """Guided-diffusion config: 512x512 class-conditional ImageNet model + classifier."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=True,
        diffusion_steps=1000,
        image_size=512,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=256,
        num_head_channels=64,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=False,
        use_scale_shift_norm=True,

        classifier_scale=4.0,
        model_path="models/512x512_diffusion.pt",
        classifier_path="models/512x512_classifier.pt"
    )
155
+
156
+
157
@PRETRAINED_REGISTRY.register()
def m_imgnet_64_256(base_samples="64_samples.npz"):
    """Guided-diffusion upsampler config: 64x64 → 256x256, conditioned on base_samples."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=True,
        diffusion_steps=1000,
        large_size=256,
        small_size=64,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=192,
        num_heads=4,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        model_path="models/64_256_upsampler.pt",

        base_samples=base_samples,
    )
178
+
179
+
180
@PRETRAINED_REGISTRY.register()
def m_imgnet_128_512(base_samples="128_samples.npz",):
    """Guided-diffusion upsampler config: 128x128 → 512x512, conditioned on base_samples."""
    return dict(
        attention_resolutions="32,16",
        class_cond=True,
        diffusion_steps=1000,
        large_size=512,
        small_size=128,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=192,
        num_head_channels=64,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        model_path="models/128_512_upsampler.pt",

        base_samples=base_samples,
    )
201
+
202
+
203
@PRETRAINED_REGISTRY.register()
def m_lsun_256(category="bedroom"):
    """Guided-diffusion config: 256x256 unconditional LSUN model for *category*."""
    return dict(
        attention_resolutions="32,16,8",
        class_cond=False,
        diffusion_steps=1000,
        dropout=0.1,
        image_size=256,
        learn_sigma=True,
        noise_schedule="linear",
        num_channels=256,
        num_head_channels=64,
        num_res_blocks=2,
        resblock_updown=True,
        use_fp16=True,
        use_scale_shift_norm=True,

        model_path=f"models/lsun_{category}.pt"
    )
222
+
223
+
224
def img_gen(specific_cfgs, num_samples=16, batch_size=16, load_only=False, ckpt_root=Path("")):
    """Build a guided-diffusion model (and optional classifier) and sample images.

    specific_cfgs: one of the registered config dicts (m_imgnet_*, m_lsun_*).
    load_only: if True, return (model, classifier) without sampling.
    Otherwise samples num_samples images and writes them to ./out/samples_*.npz.
    """
    cfgs = EasyDict(
        clip_denoised=True,
        num_samples=num_samples,
        batch_size=batch_size,
        use_ddim=False,
        model_path="",
        classifier_path="",
        classifier_scale=1.0,
    )
    # layer defaults, then classifier defaults, then the model-specific overrides
    cfgs.update(model_and_diffusion_defaults())
    cfgs.update(classifier_defaults())
    cfgs.update(specific_cfgs)

    use_classifier_guidance = bool(cfgs.classifier_path)
    class_aware = cfgs.class_cond or use_classifier_guidance

    model, diffusion = create_model_and_diffusion(
        **pick_out_cfgs(cfgs, model_and_diffusion_defaults().keys())
    )
    model.load_state_dict(
        load_ckpt(str(ckpt_root / cfgs.model_path), map_location="cpu")
    )
    model.to(device)
    if cfgs.use_fp16:
        model.convert_to_fp16()
    model.eval()

    def model_fn(x, t, y=None):
        # drop the label when the unet itself is unconditional
        return model(x, t, y if cfgs.class_cond else None)

    classifier = None
    cond_fn = None
    if use_classifier_guidance:
        classifier = create_classifier(
            **pick_out_cfgs(cfgs, classifier_defaults().keys())
        )
        classifier.load_state_dict(
            load_ckpt(str(ckpt_root / cfgs.classifier_path), map_location="cpu")
        )
        classifier.to(device)
        if cfgs.classifier_use_fp16:
            classifier.convert_to_fp16()
        classifier.eval()

        def cond_fn(x, t, y=None):
            # classifier-guidance gradient: ∇_x log p(y|x, t), scaled
            assert y is not None
            with torch.enable_grad():
                x_in = x.detach().requires_grad_(True)
                logits = classifier(x_in, t)
                log_probs = F.log_softmax(logits, dim=-1)
                selected = log_probs[range(len(logits)), y.view(-1)]
                return torch.autograd.grad(selected.sum(), x_in)[0] * cfgs.classifier_scale

    if load_only:
        return model, classifier

    all_images = []
    all_labels = []

    while len(all_images) * cfgs.batch_size < cfgs.num_samples:
        model_kwargs = {}

        if class_aware:
            # random ImageNet class per batch element
            classes = torch.randint(
                low=0, high=NUM_CLASSES, size=(cfgs.batch_size,), device=device
            )
            model_kwargs["y"] = classes

        sample_fn = (
            diffusion.p_sample_loop if not cfgs.use_ddim else diffusion.ddim_sample_loop
        )
        sample = sample_fn(
            model_fn,
            (cfgs.batch_size, 3, cfgs.image_size, cfgs.image_size),
            clip_denoised=cfgs.clip_denoised,
            model_kwargs=model_kwargs,
            cond_fn=cond_fn,
            device=device,
            progress=True
        )
        # [-1, 1] float → uint8 NHWC
        sample = ((sample + 1) * 127.5).clamp(0, 255).to(torch.uint8)
        sample = sample.permute(0, 2, 3, 1)
        sample = sample.contiguous()

        all_images.append(sample.cpu().numpy())
        if class_aware:
            all_labels.append(classes.cpu().numpy())

    arr = np.concatenate(all_images, axis=0)
    arr = arr[:cfgs.num_samples]

    if class_aware:
        all_labels = np.concatenate(all_labels, axis=0)
        all_labels = all_labels[:cfgs.num_samples]

    # NOTE(review): all_labels may still be the empty list when not class_aware;
    # np.savez will store it as an empty array — confirm downstream readers cope.
    shape_str = "x".join([str(x) for x in arr.shape])
    out_path = Path("./out") / f"samples_{shape_str}.npz"
    np.savez(out_path, arr, all_labels)
323
+
324
+
325
def img_upsamp(specific_cfgs, num_samples=16, batch_size=16, load_only=False):
    """note that here the ckpt root is not configured properly; will break but easy fix"""
    # Build a guided-diffusion super-resolution model; if load_only return it,
    # otherwise upsample batches of low-res samples and save them to ./out.
    cfgs = EasyDict(
        clip_denoised=True,
        num_samples=num_samples,
        batch_size=batch_size,
        use_ddim=False,
        base_samples="",
        model_path="",
    )
    cfgs.update(sr_model_and_diffusion_defaults())
    cfgs.update(specific_cfgs)

    model, diffusion = sr_create_model_and_diffusion(
        **pick_out_cfgs(cfgs, sr_model_and_diffusion_defaults().keys())
    )
    model.load_state_dict(load_ckpt(cfgs.model_path, map_location="cpu"))
    model.to(device)
    if cfgs.use_fp16:
        model.convert_to_fp16()
    model.eval()

    if load_only:
        return model

    # endless generator of {"low_res": ..., "y": ...} conditioning batches
    data = load_low_res_samples(
        cfgs.base_samples, cfgs.batch_size, cfgs.class_cond
    )

    all_images = []
    while len(all_images) * cfgs.batch_size < cfgs.num_samples:
        model_kwargs = next(data)
        model_kwargs = {k: v.to(device) for k, v in model_kwargs.items()}
        samples = diffusion.p_sample_loop(
            model,
            (cfgs.batch_size, 3, cfgs.large_size, cfgs.large_size),
            clip_denoised=cfgs.clip_denoised,
            model_kwargs=model_kwargs,
            progress=True
        )
        # [-1, 1] float → uint8 NHWC
        samples = ((samples + 1) * 127.5).clamp(0, 255).to(torch.uint8)
        samples = samples.permute(0, 2, 3, 1)
        samples = samples.contiguous()

        all_images.append(samples.cpu().numpy())

    arr = np.concatenate(all_images, axis=0)
    arr = arr[: cfgs.num_samples]

    shape_str = "x".join([str(x) for x in arr.shape])
    out_path = Path("./out") / f"samples_{shape_str}.npz"
    np.savez(out_path, arr)
377
+
378
+
379
def load_low_res_samples(base_samples, batch_size, class_cond):
    """Endlessly yield model_kwargs batches of low-res conditioning images.

    Loads uint8 NHWC images (and labels when class_cond) from the npz at
    *base_samples*, rescales to [-1, 1] floats in NCHW order, and cycles
    through the data forever in batches of *batch_size*.
    """
    obj = np.load(base_samples)
    image_arr = obj["arr_0"]
    label_arr = obj["arr_1"] if class_cond else None

    buffer, label_buffer = [], []
    while True:
        for idx in range(len(image_arr)):
            buffer.append(image_arr[idx])
            if class_cond:
                label_buffer.append(label_arr[idx])

            if len(buffer) == batch_size:
                batch = torch.from_numpy(np.stack(buffer)).float()
                batch = batch / 127.5 - 1.0          # uint8 [0,255] -> [-1,1]
                batch = batch.permute(0, 3, 1, 2)    # NHWC -> NCHW
                res = {"low_res": batch}
                if class_cond:
                    res["y"] = torch.from_numpy(np.stack(label_buffer))
                yield res
                buffer, label_buffer = [], []
403
+
404
+
405
def class_cond_info(imgnet_cat):
    """Return a conditioning function producing {"y": class_ids} per batch.

    imgnet_cat == -1 draws a random ImageNet class for every batch element;
    any other value conditions the whole batch on that fixed category id.
    """
    if imgnet_cat == -1:
        def cond(batch_size):
            cats = torch.randint(
                low=0, high=NUM_CLASSES, size=(batch_size,), device=device
            )
            return {"y": cats}
    else:
        def cond(batch_size):
            cats = torch.tensor([imgnet_cat] * batch_size, device=device)
            return {"y": cats}
    return cond
421
+
422
+
423
+ def _sqrt(x):
424
+ if isinstance(x, float):
425
+ return sqrt(x)
426
+ else:
427
+ assert isinstance(x, torch.Tensor)
428
+ return torch.sqrt(x)
429
+
430
+
431
class GuidedDDPM(ScoreAdapter):
    """ScoreAdapter over OpenAI guided-diffusion checkpoints.

    Converts the discrete-time ε-prediction DDPM into the continuous-σ
    denoiser interface via the change of variables u_j = sqrt((1-ᾱ_j)/ᾱ_j).
    """

    def __init__(self, model, lsun_cat, imgnet_cat):
        # `model` is a registry key (e.g. "m_imgnet_256"); lsun_cat only
        # applies to the m_lsun_* configs, imgnet_cat to class-conditional ones.
        print(PRETRAINED_REGISTRY)
        cfgs = PRETRAINED_REGISTRY.get(model)(
            **({"category": lsun_cat} if model.startswith("m_lsun") else {})
        )

        self.unet, self.classifier = img_gen(
            cfgs, load_only=True, ckpt_root=self.checkpoint_root() / "guided_ddpm"
        )

        H, W = cfgs['image_size'], cfgs['image_size']
        self._data_shape = (3, H, W)

        if cfgs['class_cond'] or (self.classifier is not None):
            cond_func = class_cond_info(imgnet_cat)
        else:
            cond_func = lambda *args, **kwargs: {}
        self.cond_func = cond_func

        self._unet_is_cond = bool(cfgs['class_cond'])

        noise_schedule = cfgs['noise_schedule']
        assert noise_schedule in ("linear", "cosine")
        self.M = 1000  # number of discrete DDPM timesteps
        if noise_schedule == "linear":
            self.us = self.linear_us(self.M)
            self._σ_min = 0.01
        else:
            self.us = self.cosine_us(self.M)
            self._σ_min = 0.0064
        self.noise_schedule = noise_schedule

        self._device = next(self.unet.parameters()).device

    def data_shape(self):
        return self._data_shape

    @property
    def σ_max(self):
        # us is sorted descending, so the first entry is the largest σ
        return self.us[0]

    @property
    def σ_min(self):
        return self.us[-1]

    @torch.no_grad()
    def denoise(self, xs, σ, **model_kwargs):
        """Denoised estimate D(xs; σ) from the ε-prediction unet."""
        N = xs.shape[0]
        cond_t, σ = self.time_cond_vec(N, σ)
        # input scaling maps the variance-exploding xs onto the DDPM's scale
        output = self.unet(
            xs / _sqrt(1 + σ**2), cond_t, **model_kwargs
        )
        # not using the var pred
        n_hat = torch.split(output, xs.shape[1], dim=1)[0]
        Ds = xs - σ * n_hat
        return Ds

    def cond_info(self, batch_size):
        return self.cond_func(batch_size)

    def unet_is_cond(self):
        return self._unet_is_cond

    def use_cls_guidance(self):
        return (self.classifier is not None)

    @torch.no_grad()
    def classifier_grad(self, xs, σ, ys):
        """∇_x log p(y|x) from the noisy classifier, rescaled to σ-space."""
        N = xs.shape[0]
        cond_t, σ = self.time_cond_vec(N, σ)
        with torch.enable_grad():
            x_in = xs.detach().requires_grad_(True)
            logits = self.classifier(x_in, cond_t)
            log_probs = F.log_softmax(logits, dim=-1)
            selected = log_probs[range(len(logits)), ys.view(-1)]
            grad = torch.autograd.grad(selected.sum(), x_in)[0]

        # chain rule through the 1/sqrt(1+σ²) input scaling used in denoise
        grad = grad * (1 / sqrt(1 + σ**2))
        return grad

    def snap_t_to_nearest_tick(self, t):
        """Snap σ=t onto the closest of the M trained noise levels."""
        j = np.abs(t - self.us).argmin()
        return self.us[j], j

    def time_cond_vec(self, N, σ):
        """Map σ (scalar or tensor) to the unet's integer timestep conditioning.

        Returns (cond_t, snapped_σ); us is descending, hence the (M-1)-j flip.
        """
        if isinstance(σ, float):
            σ, j = self.snap_t_to_nearest_tick(σ)  # σ might change due to snapping
            cond_t = (self.M - 1) - j
            cond_t = torch.tensor([cond_t] * N, device=self.device)
            return cond_t, σ
        else:
            assert isinstance(σ, torch.Tensor)
            σ = σ.reshape(-1).cpu().numpy()
            σs = []
            js = []
            for elem in σ:
                _σ, _j = self.snap_t_to_nearest_tick(elem)
                σs.append(_σ)
                js.append((self.M - 1) - _j)

            cond_t = torch.tensor(js, device=self.device)
            σs = torch.tensor(σs, device=self.device, dtype=torch.float32).reshape(-1, 1, 1, 1)
            return cond_t, σs

    @staticmethod
    def cosine_us(M=1000):
        """σ levels implied by the improved-DDPM cosine ᾱ schedule (descending)."""
        assert M == 1000

        def α_bar(j):
            return sin(pi / 2 * j / (M * (0.008 + 1))) ** 2

        us = [0, ]
        for j in reversed(range(0, M)):  # [M-1, 0], inclusive
            u_j = sqrt(((us[-1] ** 2) + 1) / (max(α_bar(j) / α_bar(j+1), 0.001)) - 1)
            us.append(u_j)

        us = np.array(us)
        us = us[1:]
        us = us[::-1]
        return us

    @staticmethod
    def linear_us(M=1000):
        """σ levels implied by the original DDPM linear β schedule (descending)."""
        assert M == 1000
        β_start = 0.0001
        β_end = 0.02
        βs = np.linspace(β_start, β_end, M, dtype=np.float64)
        αs = np.cumprod(1 - βs)
        us = np.sqrt((1 - αs) / αs)
        us = us[::-1]
        return us
adapt_ncsn.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import argparse
3
+ import yaml
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from ncsn.ncsnv2 import NCSNv2, NCSNv2Deeper, NCSNv2Deepest, get_sigmas
9
+ from ncsn.ema import EMAHelper
10
+
11
+ from adapt import ScoreAdapter
12
+
13
+ device = torch.device("cuda")
14
+
15
+
16
def get_model(config):
    """Instantiate the NCSNv2 variant matching config.data.dataset.

    CIFAR10/CELEBA use the base NCSNv2, FFHQ the deepest variant, LSUN the
    deeper variant; the model is moved to config.device.

    Raises:
        ValueError: for an unrecognized dataset name. (Previously the
        function fell through and silently returned None, which crashed
        later with an opaque AttributeError.)
    """
    dataset = config.data.dataset
    if dataset == 'CIFAR10' or dataset == 'CELEBA':
        return NCSNv2(config).to(config.device)
    elif dataset == "FFHQ":
        return NCSNv2Deepest(config).to(config.device)
    elif dataset == 'LSUN':
        return NCSNv2Deeper(config).to(config.device)
    raise ValueError(f"unsupported dataset: {dataset!r}")
23
+
24
+
25
def dict2namespace(config):
    """Recursively convert a nested dict into argparse.Namespace objects."""
    ns = argparse.Namespace()
    for key, value in config.items():
        setattr(
            ns, key,
            dict2namespace(value) if isinstance(value, dict) else value,
        )
    return ns
34
+
35
+
36
class NCSN(ScoreAdapter):
    """ScoreAdapter over a pretrained NCSNv2 (LSUN bedroom) score network."""

    def __init__(self):
        # config shipped next to this file; checkpoint comes from checkpoint_root()
        config_fname = Path(__file__).resolve().parent / "ncsn" / "bedroom.yml"
        with config_fname.open("r") as f:
            config = yaml.safe_load(f)
        config = dict2namespace(config)

        config.device = device

        states = torch.load(
            self.checkpoint_root() / "ncsn/exp/logs/bedroom/checkpoint_150000.pth"
        )

        model = get_model(config)
        model = torch.nn.DataParallel(model)
        model.load_state_dict(states[0], strict=True)

        if config.model.ema:
            ema_helper = EMAHelper(mu=config.model.ema_rate)
            ema_helper.register(model)
            ema_helper.load_state_dict(states[-1])
            # HC: update the model param with history ema.
            # if don't do this the colors of images become strangely saturated.
            # this is reported in the paper.
            ema_helper.ema(model)

        model = model.module  # remove DataParallel
        model.eval()
        self.model = model
        self._data_shape = (3, config.data.image_size, config.data.image_size)

        # the model's trained noise levels, descending
        self.σs = model.sigmas.cpu().numpy()
        self._device = device

    def data_shape(self):
        return self._data_shape

    def samps_centered(self):
        # NCSN samples live in [0, 1], not [-1, 1]
        return False

    @property
    def σ_max(self):
        return self.σs[0]

    @property
    def σ_min(self):
        return self.σs[-1]

    @torch.no_grad()
    def denoise(self, xs, σ):
        """Denoised estimate via Tweedie: D = x + σ²·score(x, σ)."""
        σ, j = self.snap_t_to_nearest_tick(σ)
        N = xs.shape[0]
        cond_t = torch.tensor([j] * N, dtype=torch.long, device=self.device)
        score = self.model(xs, cond_t)
        Ds = xs + score * (σ ** 2)
        return Ds

    def unet_is_cond(self):
        return False

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        """Snap σ=t onto the closest trained noise level; return (σ, index)."""
        j = np.abs(t - self.σs).argmin()
        return self.σs[j], j
adapt_sd.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from pathlib import Path
3
+ import torch
4
+ import numpy as np
5
+ from omegaconf import OmegaConf
6
+ from einops import rearrange
7
+
8
+ from torch import autocast
9
+ from contextlib import nullcontext
10
+ from math import sqrt
11
+ from adapt import ScoreAdapter
12
+
13
+ import warnings
14
+ from transformers import logging
15
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
16
+ logging.set_verbosity_error()
17
+
18
+
19
+ device = torch.device("cuda")
20
+
21
+
22
def curr_dir():
    """Directory containing this source file, with symlinks resolved."""
    return Path(__file__).resolve().parent
24
+
25
+
26
def add_import_path(dirname):
    """Append <this file's dir>/dirname to sys.path.

    Used to make the vendored sd1/sd2 source trees importable.
    """
    sys.path.append(str(
        curr_dir() / str(dirname)
    ))
30
+
31
+
32
def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate a latent-diffusion model from an OmegaConf config and load *ckpt*.

    Missing/unexpected state-dict keys are tolerated (strict=False) and only
    printed when verbose. The model is moved to the module-level device and
    put in eval mode.
    """
    # local import: `ldm` only becomes importable after add_import_path(...)
    from ldm.util import instantiate_from_config
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.to(device)
    model.eval()
    return model
51
+
52
+
53
def load_sd1_model(ckpt_root):
    """Load Stable Diffusion v1.5 (512x512); return (model, H, W)."""
    ckpt_fname = ckpt_root / "stable_diffusion" / "sd-v1-5.ckpt"
    cfg_fname = curr_dir() / "sd1" / "configs" / "v1-inference.yaml"
    H, W = 512, 512

    config = OmegaConf.load(str(cfg_fname))
    model = load_model_from_config(config, str(ckpt_fname))
    return model, H, W
61
+
62
+
63
def load_sd2_model(ckpt_root, v2_highres):
    """Load Stable Diffusion v2; return (model, H, W).

    v2_highres selects the 768-v (v-parameterized) checkpoint, otherwise the
    512 base (ε-parameterized) checkpoint.
    """
    if v2_highres:
        ckpt_fname = ckpt_root / "sd2" / "768-v-ema.ckpt"
        cfg_fname = curr_dir() / "sd2/configs/stable-diffusion/v2-inference-v.yaml"
        H, W = 768, 768
    else:
        ckpt_fname = ckpt_root / "sd2" / "512-base-ema.ckpt"
        cfg_fname = curr_dir() / "sd2/configs/stable-diffusion/v2-inference.yaml"
        H, W = 512, 512

    config = OmegaConf.load(f"{cfg_fname}")
    model = load_model_from_config(config, str(ckpt_fname))
    return model, H, W
76
+
77
+
78
+ def _sqrt(x):
79
+ if isinstance(x, float):
80
+ return sqrt(x)
81
+ else:
82
+ assert isinstance(x, torch.Tensor)
83
+ return torch.sqrt(x)
84
+
85
+
86
class StableDiffusion(ScoreAdapter):
    """ScoreAdapter over Stable Diffusion v1/v2 latent diffusion.

    Works in the 4-channel latent space (1/8 spatial resolution) with
    classifier-free guidance controlled by `scale`.
    """

    def __init__(self, variant, v2_highres, prompt, scale, precision):
        # variant: "v1" or "v2"; v2_highres picks the 768 v-parameterized ckpt
        if variant == "v1":
            add_import_path("sd1")
            self.model, H, W = load_sd1_model(self.checkpoint_root())
        elif variant == "v2":
            add_import_path("sd2")
            self.model, H, W = load_sd2_model(self.checkpoint_root(), v2_highres)
        else:
            raise ValueError(f"{variant}")

        # the VAE downsamples images by 8x in each spatial dim
        ae_resolution_f = 8

        # NOTE(review): relies on the loaded model exposing `_device` — confirm
        # against the vendored ldm code.
        self._device = self.model._device

        self.prompt = prompt
        self.scale = scale  # classifier-free-guidance strength
        self.precision = precision
        self.precision_scope = autocast if self.precision == "autocast" else nullcontext
        self._data_shape = (4, H // ae_resolution_f, W // ae_resolution_f)

        self.cond_func = self.model.get_learned_conditioning
        self.M = 1000  # number of discrete DDPM timesteps
        noise_schedule = "linear"
        self.noise_schedule = noise_schedule
        self.us = self.linear_us(self.M)

    def data_shape(self):
        return self._data_shape

    @property
    def σ_max(self):
        # us is sorted descending
        return self.us[0]

    @property
    def σ_min(self):
        return self.us[-1]

    @torch.no_grad()
    def denoise(self, xs, σ, **model_kwargs):
        """Denoised latent estimate with classifier-free guidance.

        model_kwargs must contain 'c' (prompt embedding) and 'uc'
        (unconditional embedding, or None when scale == 1).
        """
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                N = xs.shape[0]
                c = model_kwargs.pop('c')
                uc = model_kwargs.pop('uc')
                cond_t, σ = self.time_cond_vec(N, σ)
                unscaled_xs = xs
                # map variance-exploding xs onto the DDPM's input scale
                xs = xs / _sqrt(1 + σ**2)
                if uc is None or self.scale == 1.:
                    output = self.model.apply_model(xs, cond_t, c)
                else:
                    # classifier-free guidance: one batched cond+uncond pass
                    x_in = torch.cat([xs] * 2)
                    t_in = torch.cat([cond_t] * 2)
                    c_in = torch.cat([uc, c])
                    e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                    output = e_t_uncond + self.scale * (e_t - e_t_uncond)

                # v-parameterized checkpoints (SD2 768) predict v, convert to ε
                if self.model.parameterization == "v":
                    output = self.model.predict_eps_from_z_and_v(xs, cond_t, output)
                else:
                    output = output

                Ds = unscaled_xs - σ * output
                return Ds

    def cond_info(self, batch_size):
        prompts = batch_size * [self.prompt]
        return self.prompts_emb(prompts)

    @torch.no_grad()
    def prompts_emb(self, prompts):
        """Encode prompts to {'c': cond, 'uc': uncond-or-None} embeddings."""
        assert isinstance(prompts, list)
        batch_size = len(prompts)
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                cond = {}
                c = self.cond_func(prompts)
                cond['c'] = c
                uc = None
                if self.scale != 1.0:
                    # empty-string prompt provides the unconditional branch
                    uc = self.cond_func(batch_size * [""])
                cond['uc'] = uc
                return cond

    def unet_is_cond(self):
        return True

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        """Snap σ=t onto the closest trained noise level; return (σ, index)."""
        j = np.abs(t - self.us).argmin()
        return self.us[j], j

    def time_cond_vec(self, N, σ):
        """Map σ (scalar or tensor) to integer timestep conditioning.

        Returns (cond_t, snapped_σ); us is descending, hence the (M-1)-j flip.
        """
        if isinstance(σ, float):
            σ, j = self.snap_t_to_nearest_tick(σ)  # σ might change due to snapping
            cond_t = (self.M - 1) - j
            cond_t = torch.tensor([cond_t] * N, device=self.device)
            return cond_t, σ
        else:
            assert isinstance(σ, torch.Tensor)
            σ = σ.reshape(-1).cpu().numpy()
            σs = []
            js = []
            for elem in σ:
                _σ, _j = self.snap_t_to_nearest_tick(elem)
                σs.append(_σ)
                js.append((self.M - 1) - _j)

            cond_t = torch.tensor(js, device=self.device)
            σs = torch.tensor(σs, device=self.device, dtype=torch.float32).reshape(-1, 1, 1, 1)
            return cond_t, σs

    @staticmethod
    def linear_us(M=1000):
        """σ levels from SD's scaled-linear β schedule, descending."""
        assert M == 1000
        β_start = 0.00085
        β_end = 0.0120
        βs = np.linspace(β_start**0.5, β_end**0.5, M, dtype=np.float64)**2
        αs = np.cumprod(1 - βs)
        us = np.sqrt((1 - αs) / αs)
        us = us[::-1]
        return us

    @torch.no_grad()
    def encode(self, xs):
        """Encode images to latents via the first-stage VAE."""
        model = self.model
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                zs = model.get_first_stage_encoding(
                    model.encode_first_stage(xs)
                )
        return zs

    @torch.no_grad()
    def decode(self, xs):
        """Decode latents back to image space via the first-stage VAE."""
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                xs = self.model.decode_first_stage(xs)
                return xs
227
+
228
+
229
def test():
    """Smoke test: load SD v2 highres and print the adapter (needs ckpts + GPU)."""
    sd = StableDiffusion("v2", True, "haha", 10.0, "autocast")
    print(sd)


if __name__ == "__main__":
    test()
adapt_vesde.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import torch
3
+ from ml_collections.config_flags import config_flags
4
+
5
+ from sde.config import get_config
6
+ from sde import ddpm, ncsnv2, ncsnpp # need to import to trigger its registry
7
+ from sde import utils as mutils
8
+ from sde.ema import ExponentialMovingAverage
9
+
10
+ from adapt import ScoreAdapter
11
+
12
+ device = torch.device("cuda")
13
+
14
+
15
def restore_checkpoint(ckpt_dir, state, device):
    """Load model/EMA weights and the step counter from ckpt_dir into state.

    The optimizer state is deliberately not restored (inference-only usage).
    NOTE(review): torch.load unpickles arbitrary objects — only load trusted
    checkpoints. Returns the same state dict, updated in place.
    """
    checkpoint = torch.load(ckpt_dir, map_location=device)
    # state['optimizer'].load_state_dict(checkpoint['optimizer'])
    state['model'].load_state_dict(checkpoint['model'], strict=False)
    state['ema'].load_state_dict(checkpoint['ema'])
    state['step'] = checkpoint['step']
    return state
22
+
23
+
24
def save_checkpoint(ckpt_dir, state):
    """Serialize optimizer, model, EMA state dicts and step counter to disk."""
    torch.save(
        {
            'optimizer': state['optimizer'].state_dict(),
            'model': state['model'].state_dict(),
            'ema': state['ema'].state_dict(),
            'step': state['step'],
        },
        ckpt_dir,
    )
32
+
33
+
34
class VESDE(ScoreAdapter):
    """ScoreAdapter wrapper around a pretrained VE-SDE score network.

    Loads the checkpoint, swaps in EMA weights, and exposes the denoising
    interface expected by the SJC driver.
    """

    def __init__(self):
        config = get_config()
        config.device = device
        ckpt_fname = self.checkpoint_root() / "sde" / 'checkpoint_127.pth'

        score_model = mutils.create_model(config)
        ema = ExponentialMovingAverage(
            score_model.parameters(), decay=config.model.ema_rate
        )
        state = dict(model=score_model, ema=ema, step=0)
        # (channels, H, W) of the images the model was trained on.
        self._data_shape = (
            config.data.num_channels, config.data.image_size, config.data.image_size
        )

        # factor 2 presumably mirrors the 1/2 σ correction applied in
        # denoise() below (Karras eqn. 212-215) — TODO confirm.
        self._σ_min = float(config.model.sigma_min * 2)

        state = restore_checkpoint(ckpt_fname, state, device=config.device)
        # Use the EMA weights for inference.
        ema.copy_to(score_model.parameters())

        score_model.eval()
        score_model = score_model.module  # remove DataParallel

        self.model = score_model
        self._device = device

    def data_shape(self):
        # (channels, height, width) tuple, set in __init__.
        return self._data_shape

    @property
    def σ_min(self):
        # Smallest usable noise level (already includes the factor-2 scaling).
        return self._σ_min

    @torch.no_grad()
    def denoise(self, xs, σ):
        """Return the denoised estimate D(xs; σ) for the batch xs."""
        N = xs.shape[0]
        # see Karras eqn. 212-215 for the 1/2 σ correction
        cond_t = (0.5 * σ) * torch.ones(N, device=self.device)
        # note that the forward function the model has been modified; see comments
        n_hat = self.model(xs, cond_t)
        Ds = xs + σ * n_hat
        return Ds

    def unet_is_cond(self):
        # Unconditional score model: no text/class conditioning input.
        return False

    def use_cls_guidance(self):
        # No classifier-guidance network for this model.
        return False

    def snap_t_to_nearest_tick(self, t):
        # Defers to the base-class behavior; override kept for explicitness.
        return super().snap_t_to_nearest_tick(t)
app.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import time
3
+ from pathlib import Path
4
+ import torch
5
+ import imageio
6
+
7
+ from my.utils import tqdm
8
+ from my.utils.seed import seed_everything
9
+
10
+ from run_img_sampling import SD, StableDiffusion
11
+ from misc import torch_samps_to_imgs
12
+ from pose import PoseConfig
13
+
14
+ from run_nerf import VoxConfig
15
+ from voxnerf.utils import every
16
+ from voxnerf.vis import stitch_vis, bad_vis as nerf_vis
17
+
18
+ from run_sjc import render_one_view, tsr_stats
19
+ from highres_final_vis import highres_render_one_view
20
+
21
+ import gradio as gr
22
+ import gc
23
+ import os
24
+
25
+ device_glb = torch.device("cuda")
26
+
27
def vis_routine(y, depth):
    """Turn one rendered view into (vis pane, first uint8 image, depth array)."""
    pane = nerf_vis(y, depth, final_H=256)
    img = torch_samps_to_imgs(y)[0]
    return pane, img, depth.cpu().numpy()
32
+
33
# Inline CSS tweaks for the Gradio layout (badge/arrow positioning and
# collapsing empty component boxes).
css = '''
.instruction{position: absolute; top: 0;right: 0;margin-top: 0px !important}
.arrow{position: absolute;top: 0;right: -110px;margin-top: -8px !important}
#component-4, #component-3, #component-10{min-height: 0}
.duplicate-button img{margin: 0}
'''
39
+
40
with gr.Blocks(css=css) as demo:
    # title
    gr.Markdown('# [Score Jacobian Chaining](https://github.com/pals-ttic/sjc): Lifting Pretrained 2D Diffusion Models for 3D Generation')

    # Runtime warning banner plus the "Duplicate this Space" badge link.
    gr.HTML(f'''
            <div class="gr-prose" style="max-width: 80%">
            <h2>Attention - This Space takes over 30min to run!</h2>
            <p>If the Queue is too long you can run locally or duplicate the Space and run it on your own profile using a (paid) private T4 GPU for training. As each T4 costs US$0.60/h, it should cost < US$1 to train most models using default settings!&nbsp;&nbsp;<a style='display:inline-block' href='https://huggingface.co/spaces/MirageML/sjc?duplicate=true'><img src='https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14' alt='Duplicate Space'></a></p>
            </div>
            ''')
    # inputs
    prompt = gr.Textbox(label="Prompt", max_lines=1, value="A high quality photo of a delicious burger")
    iters = gr.Slider(label="Iters", minimum=100, maximum=20000, value=10000, step=100)
    seed = gr.Slider(label="Seed", minimum=0, maximum=2147483647, step=1, randomize=True)
    button = gr.Button('Generate')

    # outputs
    image = gr.Image(label="image", visible=True)  # live training preview
    # depth = gr.Image(label="depth", visible=True)
    video = gr.Video(label="video", visible=False)  # final turntable render
    logs = gr.Textbox(label="logging")
62
    def submit(prompt, iters, seed):
        """Train a voxel radiance field on `prompt` via SJC, then render a
        turntable video.

        Implemented as a generator so Gradio streams intermediate previews to
        the `image`/`video`/`logs` components.
        """
        start_t = time.time()
        seed_everything(seed)
        # cfgs = {'gddpm': {'model': 'm_lsun_256', 'lsun_cat': 'bedroom', 'imgnet_cat': -1}, 'sd': {'variant': 'v1', 'v2_highres': False, 'prompt': 'A high quality photo of a delicious burger', 'scale': 100.0, 'precision': 'autocast'}, 'lr': 0.05, 'n_steps': 10000, 'emptiness_scale': 10, 'emptiness_weight': 10000, 'emptiness_step': 0.5, 'emptiness_multiplier': 20.0, 'depth_weight': 0, 'var_red': True}
        # Camera sampler, Stable Diffusion score model, and voxel NeRF.
        pose = PoseConfig(rend_hw=64, FoV=60.0, R=1.5)
        poser = pose.make()
        sd_model = SD(variant='v1', v2_highres=False, prompt=prompt, scale=100.0, precision='autocast')
        model = sd_model.make()
        vox = VoxConfig(
            model_type="V_SD", grid_size=100, density_shift=-1.0, c=4,
            blend_bg_texture=True, bg_texture_hw=4,
            bbox_len=1.0)
        vox = vox.make()

        # SJC hyper-parameters (mirror the reference cfgs comment above).
        lr = 0.05
        n_steps = iters
        emptiness_scale = 10
        emptiness_weight = 10000
        emptiness_step = 0.5
        emptiness_multiplier = 20.0
        depth_weight = 0
        var_red = True  # variance-reduced score gradient (Ds - y vs Ds - zs)

        assert model.samps_centered()
        _, target_H, target_W = model.data_shape()
        bs = 1
        aabb = vox.aabb.T.cpu().numpy()
        vox = vox.to(device_glb)
        opt = torch.optim.Adamax(vox.opt_params(), lr=lr)

        H, W = poser.H, poser.W
        Ks, poses, prompt_prefixes = poser.sample_train(n_steps)

        # usable noise levels: trim the extreme ends of the schedule
        ts = model.us[30:-10]

        same_noise = torch.randn(1, 4, H, W, device=model.device).repeat(bs, 1, 1, 1)

        with tqdm(total=n_steps) as pbar:
            for i in range(n_steps):

                # per-view prompt: view-direction prefix + user prompt
                p = f"{prompt_prefixes[i]} {model.prompt}"
                score_conds = model.prompts_emb([p])

                y, depth, ws = render_one_view(vox, aabb, H, W, Ks[i], poses[i], return_w=True)

                if isinstance(model, StableDiffusion):
                    pass  # SD operates at the rendered (latent) resolution
                else:
                    y = torch.nn.functional.interpolate(y, (target_H, target_W), mode='bilinear')

                opt.zero_grad()

                with torch.no_grad():
                    # sample noise levels for this step
                    chosen_σs = np.random.choice(ts, bs, replace=False)
                    chosen_σs = chosen_σs.reshape(-1, 1, 1, 1)
                    chosen_σs = torch.as_tensor(chosen_σs, device=model.device, dtype=torch.float32)
                    # chosen_σs = us[i]

                    noise = torch.randn(bs, *y.shape[1:], device=model.device)

                    zs = y + chosen_σs * noise
                    Ds = model.denoise(zs, chosen_σs, **score_conds)

                    # SJC score gradient on the rendered image
                    if var_red:
                        grad = (Ds - y) / chosen_σs
                    else:
                        grad = (Ds - zs) / chosen_σs

                    grad = grad.mean(0, keepdim=True)

                # chain the score through the renderer
                y.backward(-grad, retain_graph=True)

                if depth_weight > 0:
                    # push the (center) object closer than the border region
                    center_depth = depth[7:-7, 7:-7]
                    border_depth_mean = (depth.sum() - center_depth.sum()) / (64*64-50*50)
                    center_depth_mean = center_depth.mean()
                    depth_diff = center_depth_mean - border_depth_mean
                    depth_loss = - torch.log(depth_diff + 1e-12)
                    depth_loss = depth_weight * depth_loss
                    depth_loss.backward(retain_graph=True)

                # sparsity regularizer on the accumulated rendering weights
                emptiness_loss = torch.log(1 + emptiness_scale * ws).mean()
                emptiness_loss = emptiness_weight * emptiness_loss
                if emptiness_step * n_steps <= i:
                    emptiness_loss *= emptiness_multiplier
                emptiness_loss.backward()

                opt.step()


                # metric.put_scalars()

                # stream a decoded preview back to the UI
                with torch.no_grad():
                    if isinstance(model, StableDiffusion):
                        y = model.decode(y)
                    pane, img, depth = vis_routine(y, depth)
                    yield {
                        image: gr.update(value=img, visible=True),
                        video: gr.update(visible=False),
                        logs: f"Steps: {i}/{n_steps}: \n" + str(tsr_stats(y)),
                    }

                # TODO: Output pane, img and depth to Gradio

                pbar.update()
                pbar.set_description(p)

        # TODO: Save Checkpoint
        # Final pass: orbit the trained field and collect frames for a video.
        with torch.no_grad():
            n_frames=200
            factor=4
            ckpt = vox.state_dict()
            H, W = poser.H, poser.W
            vox.eval()
            K, poses = poser.sample_test(n_frames)
            del n_frames
            poses = poses[60:]  # skip the full overhead view; not interesting

            aabb = vox.aabb.T.cpu().numpy()
            vox = vox.to(device_glb)

            num_imgs = len(poses)
            all_images = []

            for i in (pbar := tqdm(range(num_imgs))):

                pose = poses[i]
                y, depth = highres_render_one_view(vox, aabb, H, W, K, pose, f=factor)
                if isinstance(model, StableDiffusion):
                    y = model.decode(y)
                pane, img, depth = vis_routine(y, depth)

                # Save img to output
                all_images.append(img)

                yield {
                    image: gr.update(value=img, visible=True),
                    video: gr.update(visible=False),
                    logs: str(tsr_stats(y)),
                }

            output_video = "/tmp/tmp.mp4"

            imageio.mimwrite(output_video, all_images, quality=8, fps=10)

            end_t = time.time()

            # Final update: hide the preview image, show the assembled video.
            yield {
                image: gr.update(value=img, visible=False),
                video: gr.update(value=output_video, visible=True),
                logs: f"Generation Finished in {(end_t - start_t)/ 60:.4f} minutes!",
            }
214
+
215
    # Wire the Generate button to the streaming training generator.
    button.click(
        submit,
        [prompt, iters, seed],
        [image, video, logs]
    )

# concurrency_count: only allow ONE running progress, else GPU will OOM.
demo.queue(concurrency_count=1)
demo.launch()
env.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "data_root": "release"
3
+ }
guided_diffusion/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ Selected modules from OpenAI's [guided diffusion](https://github.com/openai/guided-diffusion), retrieved at commit `22e0df8183507e13a7813f8d38d51b072ca1e67c`
2
+
3
+ It's a bare minimum set of files needed to run their pretrained models. You can download these model checkpoints following the instructions in their repository README
4
+
5
+ Some modifications are made to remove the distributed processing utilities in order to reduce code complexity.
guided_diffusion/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Codebase for "Improved Denoising Diffusion Probabilistic Models".
3
+ """
guided_diffusion/fp16_util.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helpers to train with 16-bit precision.
3
+ """
4
+
5
+ import numpy as np
6
+ import torch as th
7
+ import torch.nn as nn
8
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
9
+
10
# from . import logger  # NOTE(review): logger was dropped from this vendored
# copy, but functions below still reference it — see MixedPrecisionTrainer.

INITIAL_LOG_LOSS_SCALE = 20.0  # starting log2 of the fp16 dynamic loss scale
13
+
14
+
15
def convert_module_to_f16(l):
    """
    Convert primitive (conv) modules to float16 weights/biases in place.
    Non-conv modules are left untouched.
    """
    if not isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        return
    l.weight.data = l.weight.data.half()
    if l.bias is not None:
        l.bias.data = l.bias.data.half()
23
+
24
+
25
def convert_module_to_f32(l):
    """
    Convert primitive (conv) modules back to float32 in place, undoing
    convert_module_to_f16(). Non-conv modules are left untouched.
    """
    if not isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        return
    l.weight.data = l.weight.data.float()
    if l.bias is not None:
        l.bias.data = l.bias.data.float()
33
+
34
+
35
def make_master_params(param_groups_and_shapes):
    """
    Copy model parameters into a (differently-shaped) list of full-precision
    master parameters — one flattened fp32 Parameter per group.
    """
    masters = []
    for group, shape in param_groups_and_shapes:
        flat = _flatten_dense_tensors([p.detach().float() for (_, p) in group])
        master = nn.Parameter(flat.view(shape))
        master.requires_grad = True
        masters.append(master)
    return masters
50
+
51
+
52
def model_grads_to_master_grads(param_groups_and_shapes, master_params):
    """
    Copy gradients from the model parameters into the flattened master
    parameters produced by make_master_params().
    """
    for master, (group, shape) in zip(master_params, param_groups_and_shapes):
        grads = [param_grad_or_zeros(p) for (_, p) in group]
        master.grad = _flatten_dense_tensors(grads).view(shape)
63
+
64
+
65
def master_params_to_model_params(param_groups_and_shapes, master_params):
    """
    Copy the master parameter data back into the model parameters.
    """
    # Without copying to a list, if a generator is passed, this will
    # silently not copy any parameters.
    for master, (group, _) in zip(master_params, param_groups_and_shapes):
        unflat = unflatten_master_params(group, master.view(-1))
        for (_, param), chunk in zip(group, unflat):
            param.detach().copy_(chunk)
76
+
77
+
78
def unflatten_master_params(param_group, master_param):
    """Split a flat master tensor back into per-parameter shaped tensors."""
    templates = [param for (_, param) in param_group]
    return _unflatten_dense_tensors(master_param, templates)
80
+
81
+
82
def get_param_groups_and_shapes(named_model_params):
    """
    Partition named params into (scalar/vector, matrix) groups, each paired
    with the flat shape that make_master_params will view its group as.
    """
    named_model_params = list(named_model_params)
    vectors = [(n, p) for (n, p) in named_model_params if p.ndim <= 1]
    matrices = [(n, p) for (n, p) in named_model_params if p.ndim > 1]
    return [(vectors, -1), (matrices, (1, -1))]
93
+
94
+
95
def master_params_to_state_dict(
    model, param_groups_and_shapes, master_params, use_fp16
):
    """
    Build a model state_dict whose tensors are taken from the master params
    (unflattened per group under fp16, positional otherwise).
    """
    state_dict = model.state_dict()
    if use_fp16:
        for master, (group, _) in zip(master_params, param_groups_and_shapes):
            unflat = unflatten_master_params(group, master.view(-1))
            for (name, _), chunk in zip(group, unflat):
                assert name in state_dict
                state_dict[name] = chunk
    else:
        # Masters are the model params themselves, in named_parameters order.
        for i, (name, _value) in enumerate(model.named_parameters()):
            assert name in state_dict
            state_dict[name] = master_params[i]
    return state_dict
114
+
115
+
116
def state_dict_to_master_params(model, state_dict, use_fp16):
    """
    Inverse of master_params_to_state_dict: rebuild the master params list
    from a state_dict.
    """
    named = [(name, state_dict[name]) for name, _ in model.named_parameters()]
    if use_fp16:
        param_groups_and_shapes = get_param_groups_and_shapes(named)
        return make_master_params(param_groups_and_shapes)
    return [tensor for _, tensor in named]
126
+
127
+
128
def zero_master_grads(master_params):
    """Drop (set to None) the .grad of every master parameter."""
    for p in master_params:
        p.grad = None
131
+
132
+
133
def zero_grad(model_params):
    """Zero existing grads in place (detach first), leaving None grads alone."""
    for param in model_params:
        # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
        if param.grad is None:
            continue
        param.grad.detach_()
        param.grad.zero_()
139
+
140
+
141
def param_grad_or_zeros(param):
    """Return the param's detached grad data, or zeros if it has no grad."""
    if param.grad is None:
        return th.zeros_like(param)
    return param.grad.data.detach()
146
+
147
+
148
class MixedPrecisionTrainer:
    """
    Optimizer wrapper implementing dynamic loss scaling for fp16 training.

    With use_fp16=False it is a thin pass-through around the optimizer.
    BUGFIX: the upstream OpenAI code logged via `from . import logger`, which
    this vendored copy commented out; the stranded `logger.logkv_mean(...)` /
    `logger.log(...)` calls raised NameError on every optimize() call. The
    logging calls are removed here.
    """

    def __init__(
        self,
        *,
        model,
        use_fp16=False,
        fp16_scale_growth=1e-3,
        initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
    ):
        self.model = model
        self.use_fp16 = use_fp16
        self.fp16_scale_growth = fp16_scale_growth

        self.model_params = list(self.model.parameters())
        self.master_params = self.model_params
        self.param_groups_and_shapes = None
        self.lg_loss_scale = initial_lg_loss_scale  # log2 of the loss scale

        if self.use_fp16:
            self.param_groups_and_shapes = get_param_groups_and_shapes(
                self.model.named_parameters()
            )
            self.master_params = make_master_params(self.param_groups_and_shapes)
            self.model.convert_to_fp16()

    def zero_grad(self):
        """Zero the gradients of the underlying model parameters."""
        zero_grad(self.model_params)

    def backward(self, loss: th.Tensor):
        """Backprop the loss, scaled by 2**lg_loss_scale under fp16."""
        if self.use_fp16:
            loss_scale = 2 ** self.lg_loss_scale
            (loss * loss_scale).backward()
        else:
            loss.backward()

    def optimize(self, opt: th.optim.Optimizer):
        """Take one optimizer step. Returns False if an fp16 overflow
        forced the step to be skipped."""
        if self.use_fp16:
            return self._optimize_fp16(opt)
        else:
            return self._optimize_normal(opt)

    def _optimize_fp16(self, opt: th.optim.Optimizer):
        model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
        grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
        if check_overflow(grad_norm):
            # NaN/inf gradients: shrink the loss scale and skip this step.
            self.lg_loss_scale -= 1
            zero_master_grads(self.master_params)
            return False

        # Un-scale the gradients before stepping the fp32 master params.
        for p in self.master_params:
            p.grad.mul_(1.0 / (2 ** self.lg_loss_scale))
        opt.step()
        zero_master_grads(self.master_params)
        master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
        self.lg_loss_scale += self.fp16_scale_growth
        return True

    def _optimize_normal(self, opt: th.optim.Optimizer):
        # Norms are still computed for parity with upstream (they were
        # previously logged); the values are no longer reported anywhere.
        grad_norm, param_norm = self._compute_norms()
        opt.step()
        return True

    def _compute_norms(self, grad_scale=1.0):
        """Return (grad_norm / grad_scale, param_norm) over the master params."""
        grad_norm = 0.0
        param_norm = 0.0
        for p in self.master_params:
            with th.no_grad():
                param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
                if p.grad is not None:
                    grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
        return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)

    def master_params_to_state_dict(self, master_params):
        return master_params_to_state_dict(
            self.model, self.param_groups_and_shapes, master_params, self.use_fp16
        )

    def state_dict_to_master_params(self, state_dict):
        return state_dict_to_master_params(self.model, state_dict, self.use_fp16)
234
+
235
+
236
def check_overflow(value):
    """True iff value is NaN, +inf, or -inf."""
    # value != value catches NaN; the membership test catches both infinities.
    return value != value or value in (float("inf"), -float("inf"))
guided_diffusion/gaussian_diffusion.py ADDED
@@ -0,0 +1,908 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This code started out as a PyTorch port of Ho et al's diffusion models:
3
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py
4
+
5
+ Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
6
+ """
7
+
8
+ import enum
9
+ import math
10
+
11
+ import numpy as np
12
+ import torch as th
13
+
14
+ from .nn import mean_flat
15
+ from .losses import normal_kl, discretized_gaussian_log_likelihood
16
+
17
+
18
def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name ("linear" or "cosine").

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps. Names must keep their behavior
    once committed, for backwards compatibility.
    """
    if schedule_name == "linear":
        # Ho et al's linear schedule, rescaled to any number of steps.
        scale = 1000 / num_diffusion_timesteps
        return np.linspace(
            scale * 0.0001, scale * 0.02, num_diffusion_timesteps, dtype=np.float64
        )
    if schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    raise NotImplementedError(f"unknown beta schedule: {schedule_name}")
43
+
44
+
45
def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    T = num_diffusion_timesteps
    betas = [
        min(1 - alpha_bar((i + 1) / T) / alpha_bar(i / T), max_beta)
        for i in range(T)
    ]
    return np.array(betas)
63
+
64
+
65
class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    Consumed by GaussianDiffusion when converting the network output into a
    posterior mean.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon
73
+
74
+
75
class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.

    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()  # model outputs the log-variance directly
    FIXED_SMALL = enum.auto()  # use the clipped posterior variance
    FIXED_LARGE = enum.auto()  # use beta_t (large fixed variance)
    LEARNED_RANGE = enum.auto()  # model interpolates between the two fixed bounds
87
+
88
+
89
class LossType(enum.Enum):
    """Training objective variants for GaussianDiffusion."""

    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        # True for the purely variational-bound objectives.
        return self == LossType.KL or self == LossType.RESCALED_KL
99
+
100
+
101
+ class GaussianDiffusion:
102
+ """
103
+ Utilities for training and sampling diffusion models.
104
+
105
+ Ported directly from here, and then adapted over time to further experimentation.
106
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42
107
+
108
+ :param betas: a 1-D numpy array of betas for each diffusion timestep,
109
+ starting at T and going to 1.
110
+ :param model_mean_type: a ModelMeanType determining what the model outputs.
111
+ :param model_var_type: a ModelVarType determining how variance is output.
112
+ :param loss_type: a LossType determining the loss function to use.
113
+ :param rescale_timesteps: if True, pass floating point timesteps into the
114
+ model so that they are always scaled like in the
115
+ original paper (0 to 1000).
116
+ """
117
+
118
    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        rescale_timesteps=False,
    ):
        """Precompute every per-timestep coefficient used by q/p sampling."""
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.rescale_timesteps = rescale_timesteps

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        # cumulative products of alpha, plus shifted copies for t-1 / t+1
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # log calculation clipped because the posterior variance is 0 at the
        # beginning of the diffusion chain.
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        )
        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev)
            * np.sqrt(alphas)
            / (1.0 - self.alphas_cumprod)
        )
170
+
171
    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).

        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        # mean = sqrt(ᾱ_t) * x_0, variance = 1 - ᾱ_t
        mean = (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        )
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, x_start.shape
        )
        return mean, variance, log_variance
187
+
188
    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.

        In other words, sample from q(x_t | x_0).

        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        # x_t = sqrt(ᾱ_t) * x_0 + sqrt(1 - ᾱ_t) * ε
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
            * noise
        )
207
+
208
    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)

        :return: (posterior_mean, posterior_variance,
                 posterior_log_variance_clipped), with shapes following x_t.
        """
        assert x_start.shape == x_t.shape
        # mean = coef1 * x_0 + coef2 * x_t (coefficients precomputed in __init__)
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped
231
+
232
    def p_mean_variance(
        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
    ):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.

        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, self._scale_timesteps(t), **model_kwargs)

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            # Learned-variance models emit 2*C channels: mean-related output
            # plus per-pixel variance values.
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            if self.model_var_type == ModelVarType.LEARNED:
                model_log_variance = model_var_values
                model_variance = th.exp(model_log_variance)
            else:
                min_log = _extract_into_tensor(
                    self.posterior_log_variance_clipped, t, x.shape
                )
                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
                # The model_var_values is [-1, 1] for [min_var, max_var]:
                # interpolate in log space between the two fixed extremes.
                frac = (model_var_values + 1) / 2
                model_log_variance = frac * max_log + (1 - frac) * min_log
                model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            # Optional user hook first, then the [-1, 1] clamp.
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            pred_xstart = process_xstart(
                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
            )
            model_mean = model_output
        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
            if self.model_mean_type == ModelMeanType.START_X:
                pred_xstart = process_xstart(model_output)
            else:
                # Epsilon parameterization: first recover x_0, then derive
                # the posterior mean from it.
                pred_xstart = process_xstart(
                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
                )
            model_mean, _, _ = self.q_posterior_mean_variance(
                x_start=pred_xstart, x_t=x, t=t
            )
        else:
            raise NotImplementedError(self.model_mean_type)

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        )
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }
327
+
328
    def _predict_xstart_from_eps(self, x_t, t, eps):
        """Recover x_0 from x_t and a predicted noise eps (inverts q_sample)."""
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )
334
+
335
    def _predict_xstart_from_xprev(self, x_t, t, xprev):
        """Recover x_0 from x_t and a predicted x_{t-1} (inverts the posterior mean)."""
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
            - _extract_into_tensor(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
            )
            * x_t
        )
344
+
345
    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        """Recover the noise eps implied by x_t and a predicted x_0."""
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - pred_xstart
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
350
+
351
    def _scale_timesteps(self, t):
        # When rescaling is enabled, map integer timesteps onto a fixed
        # [0, 1000) scale so the model sees consistent inputs no matter how
        # many diffusion steps are actually used.
        if self.rescale_timesteps:
            return t.float() * (1000.0 / self.num_timesteps)
        return t
355
+
356
+ def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
357
+ """
358
+ Compute the mean for the previous step, given a function cond_fn that
359
+ computes the gradient of a conditional log probability with respect to
360
+ x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
361
+ condition on y.
362
+
363
+ This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
364
+ """
365
+ gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
366
+ new_mean = (
367
+ p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
368
+ )
369
+ return new_mean
370
+
371
+ def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
372
+ """
373
+ Compute what the p_mean_variance output would have been, should the
374
+ model's score function be conditioned by cond_fn.
375
+
376
+ See condition_mean() for details on cond_fn.
377
+
378
+ Unlike condition_mean(), this instead uses the conditioning strategy
379
+ from Song et al (2020).
380
+ """
381
+ alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
382
+
383
+ eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
384
+ eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
385
+ x, self._scale_timesteps(t), **model_kwargs
386
+ )
387
+
388
+ out = p_mean_var.copy()
389
+ out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
390
+ out["mean"], _, _ = self.q_posterior_mean_variance(
391
+ x_start=out["pred_xstart"], x_t=x, t=t
392
+ )
393
+ return out
394
+
395
    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.

        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        # Mask of shape (N, 1, 1, ...) that zeroes the noise for batch
        # elements at t == 0, where the final sample must be deterministic.
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(
                cond_fn, out, x, t, model_kwargs=model_kwargs
            )
        # Reparameterization: mean + sigma * noise, with sigma = exp(log_var/2).
        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
440
+
441
    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model.

        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param device: if specified, the device to create the samples on.
                       If not specified, use a model parameter's device.
        :param progress: if True, show a tqdm progress bar.
        :return: a non-differentiable batch of samples.
        """
        # Drain the progressive generator and keep only the last step's
        # output, which corresponds to t == 0.
        final = None
        for sample in self.p_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
        ):
            final = sample
        return final["sample"]
486
+
487
    def p_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model and yield intermediate samples from
        each timestep of diffusion.

        Arguments are the same as p_sample_loop().
        Returns a generator over dicts, where each dict is the return value of
        p_sample().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        # Walk the chain backwards: T-1, T-2, ..., 0.
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            # Every batch element is denoised at the same timestep.
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.p_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                )
                yield out
                img = out["sample"]
536
+
537
    def ddim_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        eta=0.0,
    ):
        """
        Sample x_{t-1} from the model using DDIM.

        Same usage as p_sample(). `eta` interpolates between deterministic
        DDIM (0.0) and DDPM-like stochastic sampling (1.0).
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        if cond_fn is not None:
            out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)

        # Usually our model outputs epsilon, but we re-derive it
        # in case we used x_start or x_prev prediction.
        eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])

        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
        alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
        sigma = (
            eta
            * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * th.sqrt(1 - alpha_bar / alpha_bar_prev)
        )
        # Equation 12.
        noise = th.randn_like(x)
        mean_pred = (
            out["pred_xstart"] * th.sqrt(alpha_bar_prev)
            + th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )
        # Zero the noise for batch elements at t == 0 (final step).
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        sample = mean_pred + nonzero_mask * sigma * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}
586
+
587
+ def ddim_reverse_sample(
588
+ self,
589
+ model,
590
+ x,
591
+ t,
592
+ clip_denoised=True,
593
+ denoised_fn=None,
594
+ model_kwargs=None,
595
+ eta=0.0,
596
+ ):
597
+ """
598
+ Sample x_{t+1} from the model using DDIM reverse ODE.
599
+ """
600
+ assert eta == 0.0, "Reverse ODE only for deterministic path"
601
+ out = self.p_mean_variance(
602
+ model,
603
+ x,
604
+ t,
605
+ clip_denoised=clip_denoised,
606
+ denoised_fn=denoised_fn,
607
+ model_kwargs=model_kwargs,
608
+ )
609
+ # Usually our model outputs epsilon, but we re-derive it
610
+ # in case we used x_start or x_prev prediction.
611
+ eps = (
612
+ _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
613
+ - out["pred_xstart"]
614
+ ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
615
+ alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
616
+
617
+ # Equation 12. reversed
618
+ mean_pred = (
619
+ out["pred_xstart"] * th.sqrt(alpha_bar_next)
620
+ + th.sqrt(1 - alpha_bar_next) * eps
621
+ )
622
+
623
+ return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
624
+
625
    def ddim_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Generate samples from the model using DDIM.

        Same usage as p_sample_loop().
        """
        # Drain the progressive generator and keep only the final (t == 0)
        # output.
        final = None
        for sample in self.ddim_sample_loop_progressive(
            model,
            shape,
            noise=noise,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            cond_fn=cond_fn,
            model_kwargs=model_kwargs,
            device=device,
            progress=progress,
            eta=eta,
        ):
            final = sample
        return final["sample"]
658
+
659
    def ddim_sample_loop_progressive(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
        eta=0.0,
    ):
        """
        Use DDIM to sample from the model and yield intermediate samples from
        each timestep of DDIM.

        Same usage as p_sample_loop_progressive().
        """
        if device is None:
            device = next(model.parameters()).device
        assert isinstance(shape, (tuple, list))
        if noise is not None:
            img = noise
        else:
            img = th.randn(*shape, device=device)
        # Walk the chain backwards: T-1, T-2, ..., 0.
        indices = list(range(self.num_timesteps))[::-1]

        if progress:
            # Lazy import so that we don't depend on tqdm.
            from tqdm.auto import tqdm

            indices = tqdm(indices)

        for i in indices:
            t = th.tensor([i] * shape[0], device=device)
            with th.no_grad():
                out = self.ddim_sample(
                    model,
                    img,
                    t,
                    clip_denoised=clip_denoised,
                    denoised_fn=denoised_fn,
                    cond_fn=cond_fn,
                    model_kwargs=model_kwargs,
                    eta=eta,
                )
                yield out
                img = out["sample"]
708
+
709
    def _vb_terms_bpd(
        self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
    ):
        """
        Get a term for the variational lower-bound.

        The resulting units are bits (rather than nats, as one might expect).
        This allows for comparison to other papers.

        :return: a dict with the following keys:
                 - 'output': a shape [N] tensor of NLLs or KLs.
                 - 'pred_xstart': the x_0 predictions.
        """
        true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        out = self.p_mean_variance(
            model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        # KL between the true posterior q(x_{t-1}|x_t, x_0) and the model's
        # p(x_{t-1}|x_t); dividing by ln(2) converts nats to bits.
        kl = normal_kl(
            true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
        )
        kl = mean_flat(kl) / np.log(2.0)

        # Discretized decoder NLL: treats the model's Gaussian as a
        # distribution over 8-bit pixel bins.
        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = th.where((t == 0), decoder_nll, kl)
        return {"output": output, "pred_xstart": out["pred_xstart"]}
743
+
744
    def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
        """
        Compute training losses for a single timestep.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param t: a batch of timestep indices.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :param noise: if specified, the specific Gaussian noise to try to remove.
        :return: a dict with the key "loss" containing a tensor of shape [N].
                 Some mean or variance settings may also have other keys.
        """
        if model_kwargs is None:
            model_kwargs = {}
        if noise is None:
            noise = th.randn_like(x_start)
        # Forward-diffuse x_start to the requested timestep.
        x_t = self.q_sample(x_start, t, noise=noise)

        terms = {}

        if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
            # Pure variational-bound training.
            terms["loss"] = self._vb_terms_bpd(
                model=model,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]
            if self.loss_type == LossType.RESCALED_KL:
                terms["loss"] *= self.num_timesteps
        elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
            model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)

            if self.model_var_type in [
                ModelVarType.LEARNED,
                ModelVarType.LEARNED_RANGE,
            ]:
                B, C = x_t.shape[:2]
                assert model_output.shape == (B, C * 2, *x_t.shape[2:])
                model_output, model_var_values = th.split(model_output, C, dim=1)
                # Learn the variance using the variational bound, but don't let
                # it affect our mean prediction.
                frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
                terms["vb"] = self._vb_terms_bpd(
                    model=lambda *args, r=frozen_out: r,
                    x_start=x_start,
                    x_t=x_t,
                    t=t,
                    clip_denoised=False,
                )["output"]
                if self.loss_type == LossType.RESCALED_MSE:
                    # Divide by 1000 for equivalence with initial implementation.
                    # Without a factor of 1/1000, the VB term hurts the MSE term.
                    terms["vb"] *= self.num_timesteps / 1000.0

            # Pick the regression target matching the model's mean
            # parameterization: x_{t-1}, x_0, or the injected noise.
            target = {
                ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
                    x_start=x_start, x_t=x_t, t=t
                )[0],
                ModelMeanType.START_X: x_start,
                ModelMeanType.EPSILON: noise,
            }[self.model_mean_type]
            assert model_output.shape == target.shape == x_start.shape
            terms["mse"] = mean_flat((target - model_output) ** 2)
            if "vb" in terms:
                terms["loss"] = terms["mse"] + terms["vb"]
            else:
                terms["loss"] = terms["mse"]
        else:
            raise NotImplementedError(self.loss_type)

        return terms
818
+
819
    def _prior_bpd(self, x_start):
        """
        Get the prior KL term for the variational lower-bound, measured in
        bits-per-dim.

        This term can't be optimized, as it only depends on the encoder.

        :param x_start: the [N x C x ...] tensor of inputs.
        :return: a batch of [N] KL values (in bits), one per batch element.
        """
        batch_size = x_start.shape[0]
        # KL between q(x_T | x_0) and the standard-normal prior N(0, I);
        # dividing by ln(2) converts nats to bits.
        t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
        qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
        kl_prior = normal_kl(
            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
        )
        return mean_flat(kl_prior) / np.log(2.0)
836
+
837
    def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
        """
        Compute the entire variational lower-bound, measured in bits-per-dim,
        as well as other related quantities.

        :param model: the model to evaluate loss on.
        :param x_start: the [N x C x ...] tensor of inputs.
        :param clip_denoised: if True, clip denoised samples.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.

        :return: a dict containing the following keys:
                 - total_bpd: the total variational lower-bound, per batch element.
                 - prior_bpd: the prior term in the lower-bound.
                 - vb: an [N x T] tensor of terms in the lower-bound.
                 - xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
                 - mse: an [N x T] tensor of epsilon MSEs for each timestep.
        """
        device = x_start.device
        batch_size = x_start.shape[0]

        vb = []
        xstart_mse = []
        mse = []
        # Evaluate every timestep from T-1 down to 0, re-noising x_start
        # fresh at each step.
        for t in list(range(self.num_timesteps))[::-1]:
            t_batch = th.tensor([t] * batch_size, device=device)
            noise = th.randn_like(x_start)
            x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
            # Calculate VLB term at the current timestep
            with th.no_grad():
                out = self._vb_terms_bpd(
                    model,
                    x_start=x_start,
                    x_t=x_t,
                    t=t_batch,
                    clip_denoised=clip_denoised,
                    model_kwargs=model_kwargs,
                )
            vb.append(out["output"])
            xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
            eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
            mse.append(mean_flat((eps - noise) ** 2))

        # Stack per-timestep lists into [N x T] tensors (column 0 is t = T-1).
        vb = th.stack(vb, dim=1)
        xstart_mse = th.stack(xstart_mse, dim=1)
        mse = th.stack(mse, dim=1)

        prior_bpd = self._prior_bpd(x_start)
        total_bpd = vb.sum(dim=1) + prior_bpd
        return {
            "total_bpd": total_bpd,
            "prior_bpd": prior_bpd,
            "vb": vb,
            "xstart_mse": xstart_mse,
            "mse": mse,
        }
893
+
894
+
895
def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Extract values from a 1-D numpy array for a batch of indices.

    :param arr: the 1-D numpy array.
    :param timesteps: a tensor of indices into the array to extract.
    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
    """
    res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
    # Append singleton trailing dims until the rank matches, then expand
    # (a zero-copy broadcast view) to the requested shape.
    while len(res.shape) < len(broadcast_shape):
        res = res[..., None]
    return res.expand(broadcast_shape)
guided_diffusion/losses.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helpers for various likelihood-based losses. These are ported from the original
3
+ Ho et al. diffusion models codebase:
4
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
5
+ """
6
+
7
+ import numpy as np
8
+
9
+ import torch as th
10
+
11
+
12
def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.

    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    # Find any Tensor argument to borrow dtype/device from.
    ref = next(
        (obj for obj in (mean1, logvar1, mean2, logvar2) if isinstance(obj, th.Tensor)),
        None,
    )
    assert ref is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = (
        v if isinstance(v, th.Tensor) else th.tensor(v).to(ref)
        for v in (logvar1, logvar2)
    )

    mean_diff = mean1 - mean2
    return 0.5 * (
        logvar2
        - logvar1
        - 1.0
        + th.exp(logvar1 - logvar2)
        + (mean_diff ** 2) * th.exp(-logvar2)
    )
40
+
41
+
42
def approx_standard_normal_cdf(x):
    """
    A fast approximation of the cumulative distribution function of the
    standard normal (tanh-based, as in Ho et al.'s codebase).
    """
    inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)
    return 0.5 * (1.0 + th.tanh(inner))
48
+
49
+
50
def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    half_bin = 1.0 / 255.0  # half the width of one 8-bit pixel bin in [-1, 1]
    inv_stdv = th.exp(-log_scales)
    centered_x = x - means
    # CDF evaluated at the upper and lower edges of the pixel's bin.
    cdf_plus = approx_standard_normal_cdf(inv_stdv * (centered_x + half_bin))
    cdf_min = approx_standard_normal_cdf(inv_stdv * (centered_x - half_bin))
    # Clamp before log to avoid -inf when a CDF value saturates.
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    log_cdf_delta = th.log((cdf_plus - cdf_min).clamp(min=1e-12))
    # Edge pixels integrate the entire Gaussian tail beyond the boundary.
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, log_cdf_delta),
    )
    assert log_probs.shape == x.shape
    return log_probs
guided_diffusion/nn.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Various utilities for neural networks.
3
+ """
4
+
5
+ import math
6
+
7
+ import torch as th
8
+ import torch.nn as nn
9
+
10
+
11
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
12
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    """Sigmoid Linear Unit: x * sigmoid(x), kept manual for older PyTorch."""

    def forward(self, x):
        gate = th.sigmoid(x)
        return x * gate
15
+
16
+
17
class GroupNorm32(nn.GroupNorm):
    """GroupNorm that always normalizes in float32, then restores the input dtype."""

    def forward(self, x):
        normalized = super().forward(x.float())
        return normalized.type(x.dtype)
20
+
21
+
22
def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.

    :param dims: spatial dimensionality (1, 2, or 3).
    :raises ValueError: for any other value of dims.
    """
    conv_classes = {1: nn.Conv1d, 2: nn.Conv2d, 3: nn.Conv3d}
    try:
        conv_cls = conv_classes[dims]
    except KeyError:
        raise ValueError(f"unsupported dimensions: {dims}")
    return conv_cls(*args, **kwargs)
33
+
34
+
35
def linear(*args, **kwargs):
    """
    Create a linear module.

    Thin alias for nn.Linear so call sites can be swapped out centrally.
    """
    layer = nn.Linear(*args, **kwargs)
    return layer
40
+
41
+
42
def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.

    :param dims: spatial dimensionality (1, 2, or 3).
    :raises ValueError: for any other value of dims.
    """
    pool_classes = {1: nn.AvgPool1d, 2: nn.AvgPool2d, 3: nn.AvgPool3d}
    try:
        pool_cls = pool_classes[dims]
    except KeyError:
        raise ValueError(f"unsupported dimensions: {dims}")
    return pool_cls(*args, **kwargs)
53
+
54
+
55
def update_ema(target_params, source_params, rate=0.99):
    """
    Update target parameters to be closer to those of source parameters using
    an exponential moving average.

    :param target_params: the target parameter sequence (updated in place).
    :param source_params: the source parameter sequence.
    :param rate: the EMA rate (closer to 1 means slower).
    """
    blend = 1 - rate
    for dst, src in zip(target_params, source_params):
        # In-place: dst = rate * dst + (1 - rate) * src, outside autograd.
        dst.detach().mul_(rate).add_(src, alpha=blend)
66
+
67
+
68
def zero_module(module):
    """
    Zero out the parameters of a module and return it.

    :param module: the nn.Module whose parameters are zeroed in place.
    :return: the same module, for chaining.
    """
    for param in module.parameters():
        # detach() keeps the in-place zero out of the autograd graph.
        param.detach().zero_()
    return module
75
+
76
+
77
def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.

    :param module: the nn.Module whose parameters are scaled in place.
    :param scale: the multiplicative factor.
    :return: the same module, for chaining.
    """
    for param in module.parameters():
        # detach() keeps the in-place scaling out of the autograd graph.
        param.detach().mul_(scale)
    return module
84
+
85
+
86
def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.

    :param tensor: an [N x ...] tensor.
    :return: an [N] tensor of per-element means.
    """
    non_batch_dims = list(range(1, tensor.dim()))
    return tensor.mean(dim=non_batch_dims)
91
+
92
+
93
def normalization(channels):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization (32-group float32 GroupNorm).
    """
    num_groups = 32
    return GroupNorm32(num_groups, channels)
101
+
102
+
103
def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    # Geometric frequency ladder from 1 down to 1/max_period.
    exponents = th.arange(start=0, end=half, dtype=th.float32) / half
    freqs = th.exp(-math.log(max_period) * exponents).to(device=timesteps.device)
    angles = timesteps[:, None].float() * freqs[None]
    embedding = th.cat([th.cos(angles), th.sin(angles)], dim=-1)
    if dim % 2:
        # Odd dims get a single zero pad column so the output is exactly dim.
        embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
    return embedding
122
+
123
+
124
def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if not flag:
        # Plain evaluation; activations are cached as usual.
        return func(*inputs)
    args = tuple(inputs) + tuple(params)
    return CheckpointFunction.apply(func, len(inputs), *args)
140
+
141
+
142
class CheckpointFunction(th.autograd.Function):
    """Autograd function implementing gradient checkpointing: the forward pass
    runs without grad tracking, and the backward pass recomputes activations."""

    @staticmethod
    def forward(ctx, run_function, length, *args):
        # The first `length` entries of args are the real inputs; the rest
        # are parameters the function closes over.
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with th.no_grad():
            # no_grad: intermediate activations are NOT stored here.
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        # Re-run the forward pass with grad enabled to rebuild the graph.
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with th.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = th.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        # Drop references promptly to free the recomputed graph.
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        # The leading (None, None) match the run_function and length args.
        return (None, None) + input_grads
guided_diffusion/respace.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch as th
3
+
4
+ from .gaussian_diffusion import GaussianDiffusion
5
+
6
+
7
def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there's 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    :raises ValueError: if no integer DDIM stride yields the requested count,
                        or a section is too small for its step count.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            # Search for an integer stride producing exactly desired_count steps.
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            # Bug fix: the original message interpolated num_timesteps instead
            # of the DDIM step count that was actually requested.
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    # Split the original process into len(section_counts) near-equal sections;
    # the first `extra` sections absorb the remainder.
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        # Fractional stride so the section's first and last steps are included.
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)
61
+
62
+
63
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        # Maps each index of the shortened process to its index in the
        # original process (filled below, in ascending order).
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                # Pick beta so the shortened process reproduces the same
                # cumulative alpha at this retained step.
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        # Wrap the model so it receives original-process timesteps.
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        # Idempotent: never double-wrap.
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(
            model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
        )

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t
114
+
115
+
116
+ class _WrappedModel:
117
+ def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
118
+ self.model = model
119
+ self.timestep_map = timestep_map
120
+ self.rescale_timesteps = rescale_timesteps
121
+ self.original_num_steps = original_num_steps
122
+
123
+ def __call__(self, x, ts, **kwargs):
124
+ map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
125
+ new_ts = map_tensor[ts]
126
+ if self.rescale_timesteps:
127
+ new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
128
+ return self.model(x, new_ts, **kwargs)
guided_diffusion/script_util.py ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import inspect
3
+
4
+ from . import gaussian_diffusion as gd
5
+ from .respace import SpacedDiffusion, space_timesteps
6
+ from .unet import SuperResModel, UNetModel, EncoderUNetModel
7
+
8
+ NUM_CLASSES = 1000
9
+
10
+
11
def diffusion_defaults():
    """
    Defaults for image and classifier training.
    """
    return {
        "learn_sigma": False,
        "diffusion_steps": 1000,
        "noise_schedule": "linear",
        "timestep_respacing": "",
        "use_kl": False,
        "predict_xstart": False,
        "rescale_timesteps": False,
        "rescale_learned_sigmas": False,
    }
25
+
26
+
27
def classifier_defaults():
    """
    Defaults for classifier models.
    """
    return {
        "image_size": 64,
        "classifier_use_fp16": False,
        "classifier_width": 128,
        "classifier_depth": 2,
        "classifier_attention_resolutions": "32,16,8",  # 16
        "classifier_use_scale_shift_norm": True,  # False
        "classifier_resblock_updown": True,  # False
        "classifier_pool": "attention",
    }
41
+
42
+
43
def model_and_diffusion_defaults():
    """
    Defaults for image training.
    """
    model_defaults = {
        "image_size": 64,
        "num_channels": 128,
        "num_res_blocks": 2,
        "num_heads": 4,
        "num_heads_upsample": -1,
        "num_head_channels": -1,
        "attention_resolutions": "16,8",
        "channel_mult": "",
        "dropout": 0.0,
        "class_cond": False,
        "use_checkpoint": False,
        "use_scale_shift_norm": True,
        "resblock_updown": False,
        "use_fp16": False,
        "use_new_attention_order": False,
    }
    # Merge in the diffusion settings; the key sets are disjoint.
    return {**model_defaults, **diffusion_defaults()}
66
+
67
+
68
def classifier_and_diffusion_defaults():
    """Defaults for classifier training combined with the diffusion defaults."""
    return {**classifier_defaults(), **diffusion_defaults()}
72
+
73
+
74
def create_model_and_diffusion(
    image_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    channel_mult,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
    use_new_attention_order,
):
    """
    Build a UNet model and the matching Gaussian diffusion process.

    :return: a (model, diffusion) tuple.
    """
    model_kwargs = dict(
        channel_mult=channel_mult,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
        use_new_attention_order=use_new_attention_order,
    )
    model = create_model(image_size, num_channels, num_res_blocks, **model_kwargs)
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion
128
+
129
+
130
def create_model(
    image_size,
    num_channels,
    num_res_blocks,
    channel_mult="",
    learn_sigma=False,
    class_cond=False,
    use_checkpoint=False,
    attention_resolutions="16",
    num_heads=1,
    num_head_channels=-1,
    num_heads_upsample=-1,
    use_scale_shift_norm=False,
    dropout=0,
    resblock_updown=False,
    use_fp16=False,
    use_new_attention_order=False,
):
    """
    Construct a UNetModel for image diffusion.

    :param channel_mult: "" to select a per-image-size default, otherwise a
        comma-separated string of per-level channel multipliers.
    :raises ValueError: if channel_mult == "" and image_size has no default.
    """
    if channel_mult == "":
        # Per-resolution default channel multipliers.
        size_to_mult = {
            512: (0.5, 1, 1, 2, 2, 4, 4),
            256: (1, 1, 2, 2, 4, 4),
            128: (1, 1, 2, 3, 4),
            64: (1, 2, 3, 4),
        }
        if image_size not in size_to_mult:
            raise ValueError(f"unsupported image size: {image_size}")
        channel_mult = size_to_mult[image_size]
    else:
        channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))

    # Convert attention resolutions (in pixels) into downsample factors.
    attention_ds = [image_size // int(res) for res in attention_resolutions.split(",")]

    return UNetModel(
        image_size=image_size,
        in_channels=3,
        model_channels=num_channels,
        # When learning sigma the model emits a variance channel per color.
        out_channels=(6 if learn_sigma else 3),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        use_fp16=use_fp16,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_new_attention_order=use_new_attention_order,
    )
185
+
186
+
187
def create_classifier_and_diffusion(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
    learn_sigma,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
):
    """
    Build an encoder classifier and the matching Gaussian diffusion process.

    :return: a (classifier, diffusion) tuple.
    """
    classifier = create_classifier(
        image_size,
        classifier_use_fp16,
        classifier_width,
        classifier_depth,
        classifier_attention_resolutions,
        classifier_use_scale_shift_norm,
        classifier_resblock_updown,
        classifier_pool,
    )
    diffusion_kwargs = dict(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return classifier, create_gaussian_diffusion(**diffusion_kwargs)
226
+
227
+
228
def create_classifier(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
):
    """
    Construct an EncoderUNetModel classifier over noised images.

    :raises ValueError: for image sizes without a channel-multiplier default.
    """
    # Per-resolution default channel multipliers (same table as create_model).
    size_to_mult = {
        512: (0.5, 1, 1, 2, 2, 4, 4),
        256: (1, 1, 2, 2, 4, 4),
        128: (1, 1, 2, 3, 4),
        64: (1, 2, 3, 4),
    }
    if image_size not in size_to_mult:
        raise ValueError(f"unsupported image size: {image_size}")
    channel_mult = size_to_mult[image_size]

    # Convert attention resolutions (in pixels) into downsample factors.
    attention_ds = [
        image_size // int(res)
        for res in classifier_attention_resolutions.split(",")
    ]

    return EncoderUNetModel(
        image_size=image_size,
        in_channels=3,
        model_channels=classifier_width,
        out_channels=1000,
        num_res_blocks=classifier_depth,
        attention_resolutions=tuple(attention_ds),
        channel_mult=channel_mult,
        use_fp16=classifier_use_fp16,
        num_head_channels=64,
        use_scale_shift_norm=classifier_use_scale_shift_norm,
        resblock_updown=classifier_resblock_updown,
        pool=classifier_pool,
    )
267
+
268
+
269
def sr_model_and_diffusion_defaults():
    """
    Defaults for super-resolution training: the image-model defaults plus
    large_size/small_size, restricted to the arguments that
    sr_create_model_and_diffusion actually accepts.
    """
    res = model_and_diffusion_defaults()
    res["large_size"] = 256
    res["small_size"] = 64
    arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
    return {k: v for k, v in res.items() if k in arg_names}
278
+
279
+
280
def sr_create_model_and_diffusion(
    large_size,
    small_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
):
    """
    Build a super-resolution model and the matching diffusion process.

    :return: a (model, diffusion) tuple.
    """
    sr_kwargs = dict(
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )
    model = sr_create_model(
        large_size, small_size, num_channels, num_res_blocks, **sr_kwargs
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion
332
+
333
+
334
def sr_create_model(
    large_size,
    small_size,
    num_channels,
    num_res_blocks,
    learn_sigma,
    class_cond,
    use_checkpoint,
    attention_resolutions,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    use_scale_shift_norm,
    dropout,
    resblock_updown,
    use_fp16,
):
    """
    Construct a SuperResModel for upsampling small_size -> large_size images.

    :raises ValueError: for large sizes without a channel-multiplier default.
    """
    _ = small_size  # hack to prevent unused variable

    # 512 and 256 deliberately share the same multiplier schedule.
    size_to_mult = {
        512: (1, 1, 2, 2, 4, 4),
        256: (1, 1, 2, 2, 4, 4),
        64: (1, 2, 3, 4),
    }
    if large_size not in size_to_mult:
        raise ValueError(f"unsupported large size: {large_size}")
    channel_mult = size_to_mult[large_size]

    # Convert attention resolutions (in pixels) into downsample factors.
    attention_ds = [large_size // int(res) for res in attention_resolutions.split(",")]

    return SuperResModel(
        image_size=large_size,
        in_channels=3,
        model_channels=num_channels,
        out_channels=(6 if learn_sigma else 3),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )
384
+
385
+
386
def create_gaussian_diffusion(
    *,
    steps=1000,
    learn_sigma=False,
    sigma_small=False,
    noise_schedule="linear",
    use_kl=False,
    predict_xstart=False,
    rescale_timesteps=False,
    rescale_learned_sigmas=False,
    timestep_respacing="",
):
    """
    Build a SpacedDiffusion process from high-level configuration flags.

    :param timestep_respacing: respacing spec for space_timesteps; "" means
        use all `steps` timesteps.
    :return: a SpacedDiffusion instance.
    """
    betas = gd.get_named_beta_schedule(noise_schedule, steps)

    # KL takes precedence; rescaled MSE next; plain MSE is the fallback.
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE

    if learn_sigma:
        model_var_type = gd.ModelVarType.LEARNED_RANGE
    elif sigma_small:
        model_var_type = gd.ModelVarType.FIXED_SMALL
    else:
        model_var_type = gd.ModelVarType.FIXED_LARGE

    if predict_xstart:
        model_mean_type = gd.ModelMeanType.START_X
    else:
        model_mean_type = gd.ModelMeanType.EPSILON

    if not timestep_respacing:
        timestep_respacing = [steps]

    return SpacedDiffusion(
        use_timesteps=space_timesteps(steps, timestep_respacing),
        betas=betas,
        model_mean_type=model_mean_type,
        model_var_type=model_var_type,
        loss_type=loss_type,
        rescale_timesteps=rescale_timesteps,
    )
425
+
426
+
427
def add_dict_to_argparser(parser, default_dict):
    """
    Register one --<key> option per entry of default_dict on the parser.

    The argument type is inferred from each default value; None maps to str,
    and bools use the lenient str2bool parser.
    """
    for key, default in default_dict.items():
        if default is None:
            arg_type = str
        elif isinstance(default, bool):
            arg_type = str2bool
        else:
            arg_type = type(default)
        parser.add_argument(f"--{key}", default=default, type=arg_type)
435
+
436
+
437
def args_to_dict(args, keys):
    """Extract the named attributes from an argparse namespace as a dict."""
    selected = {}
    for key in keys:
        selected[key] = getattr(args, key)
    return selected
439
+
440
+
441
def str2bool(v):
    """
    Parse a human-friendly boolean string for argparse.

    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse

    :raises argparse.ArgumentTypeError: if the value is not recognized.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected")
guided_diffusion/unet.py ADDED
@@ -0,0 +1,894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import abstractmethod
2
+
3
+ import math
4
+
5
+ import numpy as np
6
+ import torch as th
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .fp16_util import convert_module_to_f16, convert_module_to_f32
11
+ from .nn import (
12
+ checkpoint,
13
+ conv_nd,
14
+ linear,
15
+ avg_pool_nd,
16
+ zero_module,
17
+ normalization,
18
+ timestep_embedding,
19
+ )
20
+
21
+
22
class AttentionPool2d(nn.Module):
    """
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py

    Pools a spatial feature map to a single vector via attention: the mean
    feature is prepended as an extra token and that token's attention output
    is returned.

    :param spacial_dim: spatial side length of the input feature map.
    :param embed_dim: number of input channels.
    :param num_heads_channels: channels per attention head (must divide embed_dim).
    :param output_dim: channels of the pooled output (defaults to embed_dim).
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        # One position per spatial location plus one for the mean token.
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        # x: [N, C, ...spatial]; flatten spatial dims to a token axis.
        b, c, *_spatial = x.shape
        x = x.reshape(b, c, -1)  # NC(HW)
        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
        x = self.qkv_proj(x)
        x = self.attention(x)
        x = self.c_proj(x)
        # Return only the (pooled) mean-token output: [N, output_dim].
        return x[:, :, 0]
52
+
53
+
54
class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.

    Serves as a marker base class: TimestepEmbedSequential checks
    isinstance(layer, TimestepBlock) to decide which children receive `emb`.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """
64
+
65
+
66
class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb):
        h = x
        for module in self:
            # Timestep-aware children get the embedding; plain modules do not.
            h = module(h, emb) if isinstance(module, TimestepBlock) else module(h)
        return h
79
+
80
+
81
class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels if out_channels else channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            # 3D: keep the leading (depth) axis, double the inner two.
            target = (x.shape[2], x.shape[3] * 2, x.shape[4] * 2)
            x = F.interpolate(x, target, mode="nearest")
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        return self.conv(x) if self.use_conv else x
111
+
112
+
113
class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels if out_channels else channels
        self.use_conv = use_conv
        self.dims = dims
        # 3D inputs keep the leading axis at stride 1 and halve the inner two.
        stride = (1, 2, 2) if dims == 3 else 2
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=1
            )
        else:
            # Average pooling cannot change the channel count.
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)
141
+
142
+
143
class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.

    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.use_checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        # Optional resampling applied inside the block; `h_upd` acts on the
        # hidden path and `x_upd` on the residual path so shapes stay aligned.
        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(
                emb_channels,
                # Scale-shift conditioning needs two vectors (scale and shift).
                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
            ),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            # zero_module (from .nn) wraps the final conv — presumably
            # zero-initialized so the block starts near identity; confirm in .nn.
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        # Skip connection: identity when channels match, otherwise a 3x3 or
        # 1x1 convolution depending on use_conv.
        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x, emb):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.

        :param x: an [N x C x ...] Tensor of features.
        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        return checkpoint(
            self._forward, (x, emb), self.parameters(), self.use_checkpoint
        )

    def _forward(self, x, emb):
        # Body of the block; factored out so forward() can wrap it in
        # gradient checkpointing.
        if self.updown:
            # Apply norm+SiLU first, resample both paths, then convolve the
            # hidden path, so the conv runs at the new resolution.
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        emb_out = self.emb_layers(emb).type(h.dtype)
        # Broadcast the embedding over the spatial dimensions.
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            # The embedding supplies a per-channel scale and shift applied
            # right after normalization, before the rest of out_layers.
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = th.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            h = h + emb_out
            h = self.out_layers(h)
        return self.skip_connection(x) + h
257
+
258
+
259
class AttentionBlock(nn.Module):
    """
    An attention block that allows spatial positions to attend to each other.

    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.

    :param channels: number of input/output channels.
    :param num_heads: number of attention heads (used when num_head_channels == -1).
    :param num_head_channels: channels per head; overrides num_heads when >= 0.
    :param use_checkpoint: stored, but see NOTE in forward().
    :param use_new_attention_order: select QKVAttention vs QKVAttentionLegacy.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        super().__init__()
        self.channels = channels
        if num_head_channels == -1:
            self.num_heads = num_heads
        else:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        if use_new_attention_order:
            # split qkv before split heads
            self.attention = QKVAttention(self.num_heads)
        else:
            # split heads before split qkv
            self.attention = QKVAttentionLegacy(self.num_heads)

        # proj_out is wrapped in zero_module (from .nn) — presumably
        # zero-initialized so the residual starts as identity; confirm in .nn.
        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        # NOTE(review): the checkpoint flag is hard-coded to True, so gradient
        # checkpointing is applied even when self.use_checkpoint is False —
        # confirm whether this is intentional.
        return checkpoint(self._forward, (x,), self.parameters(), True)

    def _forward(self, x):
        # Flatten all spatial dims to a single token axis, attend over
        # positions, and add the projected result as a residual.
        b, c, *spatial = x.shape
        x = x.reshape(b, c, -1)
        qkv = self.qkv(self.norm(x))
        h = self.attention(qkv)
        h = self.proj_out(h)
        return (x + h).reshape(b, c, *spatial)
306
+
307
+
308
def count_flops_attn(model, _x, y):
    """
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    """
    batch, channels = y[0].shape[0], y[0].shape[1]
    spatial_size = int(np.prod(y[0].shape[2:]))
    # We perform two matmuls with the same number of ops: one producing the
    # attention weight matrix and one combining the value vectors.
    matmul_ops = 2 * batch * spatial_size * spatial_size * channels
    model.total_ops += th.DoubleTensor([matmul_ops])
326
+
327
+
328
class QKVAttentionLegacy(nn.Module):
    """
    A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.

        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        # Fold heads into the batch axis, then split q/k/v per head.
        per_head = qkv.reshape(batch * self.n_heads, head_dim * 3, length)
        q, k, v = per_head.split(head_dim, dim=1)
        # Scale q and k each by dim^-1/4 (equivalent to dividing the logits
        # by sqrt(dim)). More stable with f16 than dividing afterwards.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        weight = th.einsum("bct,bcs->bts", q * scale, k * scale)
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        out = th.einsum("bts,bcs->bct", weight, v)
        return out.reshape(batch, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
359
+
360
+
361
class QKVAttention(nn.Module):
    """
    A module which performs QKV attention and splits in a different order.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.

        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        batch, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        head_dim = width // (3 * self.n_heads)
        # Split q/k/v first, then fold heads into the batch axis.
        q, k, v = qkv.chunk(3, dim=1)
        # Scale q and k each by dim^-1/4 (equivalent to dividing the logits
        # by sqrt(dim)). More stable with f16 than dividing afterwards.
        scale = 1 / math.sqrt(math.sqrt(head_dim))
        q = (q * scale).view(batch * self.n_heads, head_dim, length)
        k = (k * scale).view(batch * self.n_heads, head_dim, length)
        weight = th.einsum("bct,bcs->bts", q, k)
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        v = v.reshape(batch * self.n_heads, head_dim, length)
        out = th.einsum("bts,bcs->bct", weight, v)
        return out.reshape(batch, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
394
+
395
+
396
class UNetModel(nn.Module):
    """
    The full UNet model with attention and timestep embedding.

    :param in_channels: channels in the input Tensor.
    :param model_channels: base channel count for the model.
    :param out_channels: channels in the output Tensor.
    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
        attention will take place. May be a set, list, or tuple.
        For example, if this contains 4, then at 4x downsampling, attention
        will be used.
    :param dropout: the dropout probability.
    :param channel_mult: channel multiplier for each level of the UNet.
    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param num_classes: if specified (as an int), then this model will be
        class-conditional with `num_classes` classes.
    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
    :param num_heads: the number of attention heads in each attention layer.
    :param num_head_channels: if specified, ignore num_heads and instead use
        a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
        of heads for upsampling. Deprecated.
    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
    :param resblock_updown: use residual blocks for up/downsampling.
    :param use_new_attention_order: use a different attention pattern for potentially
        increased efficiency.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        num_classes=None,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
    ):
        super().__init__()

        # deprecated knob: default to the same head count as downsampling
        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        self.image_size = image_size
        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.num_classes = num_classes
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample

        # timestep embedding: sinusoidal features expanded by a 4x-wide MLP
        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        if self.num_classes is not None:
            # class labels are embedded into the same space as the time embedding
            self.label_emb = nn.Embedding(num_classes, time_embed_dim)

        # ---- downsampling trunk; `ds` tracks the current downsample rate ----
        ch = input_ch = int(channel_mult[0] * model_channels)
        self.input_blocks = nn.ModuleList(
            [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
        )
        self._feature_size = ch
        input_block_chans = [ch]  # channel count at each skip connection
        ds = 1
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=int(mult * model_channels),
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = int(mult * model_channels)
                if ds in attention_resolutions:
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads,
                            num_head_channels=num_head_channels,
                            use_new_attention_order=use_new_attention_order,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                # downsample between levels (except after the last level)
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # ---- bottleneck: resblock -> attention -> resblock ----
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=num_head_channels,
                use_new_attention_order=use_new_attention_order,
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch

        # ---- upsampling trunk; consumes the skips pushed by the input trunk ----
        self.output_blocks = nn.ModuleList([])
        for level, mult in list(enumerate(channel_mult))[::-1]:
            for i in range(num_res_blocks + 1):
                ich = input_block_chans.pop()  # matching skip-connection width
                layers = [
                    ResBlock(
                        ch + ich,
                        time_embed_dim,
                        dropout,
                        out_channels=int(model_channels * mult),
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = int(model_channels * mult)
                if ds in attention_resolutions:
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads_upsample,
                            num_head_channels=num_head_channels,
                            use_new_attention_order=use_new_attention_order,
                        )
                    )
                if level and i == num_res_blocks:
                    # upsample at the end of each level except the outermost
                    out_ch = ch
                    layers.append(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            up=True,
                        )
                        if resblock_updown
                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
                    )
                    ds //= 2
                self.output_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch

        # final projection back to out_channels; conv wrapped in zero_module —
        # presumably zero-initialized, confirm against the helper's definition
        self.out = nn.Sequential(
            normalization(ch),
            nn.SiLU(),
            zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
        )

    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)
        self.output_blocks.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Convert the torso of the model to float32.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)
        self.output_blocks.apply(convert_module_to_f32)

    def forward(self, x, timesteps, y=None):
        """
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :param y: an [N] Tensor of labels, if class-conditional.
        :return: an [N x C x ...] Tensor of outputs.
        """
        assert (y is not None) == (
            self.num_classes is not None
        ), "must specify y if and only if the model is class-conditional"

        hs = []  # skip activations, popped in reverse by the output trunk
        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))

        if self.num_classes is not None:
            assert y.shape == (x.shape[0],)
            emb = emb + self.label_emb(y)

        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb)
            hs.append(h)
        h = self.middle_block(h, emb)
        for module in self.output_blocks:
            # concatenate the matching skip connection along channels
            h = th.cat([h, hs.pop()], dim=1)
            h = module(h, emb)
        h = h.type(x.dtype)
        return self.out(h)
664
+
665
+
666
class SuperResModel(UNetModel):
    """
    A UNetModel that performs super-resolution.

    Expects an extra kwarg `low_res` to condition on a low-resolution image.
    """

    def __init__(self, image_size, in_channels, *args, **kwargs):
        # the low-res conditioning image is concatenated channel-wise,
        # so the trunk sees twice the nominal input channels
        super().__init__(image_size, in_channels * 2, *args, **kwargs)

    def forward(self, x, timesteps, low_res=None, **kwargs):
        target_h, target_w = x.shape[2], x.shape[3]
        upsampled = F.interpolate(low_res, (target_h, target_w), mode="bilinear")
        stacked = th.cat([x, upsampled], dim=1)
        return super().forward(stacked, timesteps, **kwargs)
681
+
682
+
683
class EncoderUNetModel(nn.Module):
    """
    The half UNet model with attention and timestep embedding.

    For usage, see UNet.
    """

    def __init__(
        self,
        image_size,
        in_channels,
        model_channels,
        out_channels,
        num_res_blocks,
        attention_resolutions,
        dropout=0,
        channel_mult=(1, 2, 4, 8),
        conv_resample=True,
        dims=2,
        use_checkpoint=False,
        use_fp16=False,
        num_heads=1,
        num_head_channels=-1,
        num_heads_upsample=-1,
        use_scale_shift_norm=False,
        resblock_updown=False,
        use_new_attention_order=False,
        pool="adaptive",
    ):
        super().__init__()

        # deprecated knob: default to the same head count as downsampling
        if num_heads_upsample == -1:
            num_heads_upsample = num_heads

        self.in_channels = in_channels
        self.model_channels = model_channels
        self.out_channels = out_channels
        self.num_res_blocks = num_res_blocks
        self.attention_resolutions = attention_resolutions
        self.dropout = dropout
        self.channel_mult = channel_mult
        self.conv_resample = conv_resample
        self.use_checkpoint = use_checkpoint
        self.dtype = th.float16 if use_fp16 else th.float32
        self.num_heads = num_heads
        self.num_head_channels = num_head_channels
        self.num_heads_upsample = num_heads_upsample

        # timestep embedding: sinusoidal features expanded by a 4x-wide MLP
        time_embed_dim = model_channels * 4
        self.time_embed = nn.Sequential(
            linear(model_channels, time_embed_dim),
            nn.SiLU(),
            linear(time_embed_dim, time_embed_dim),
        )

        # ---- downsampling trunk, identical in shape to UNetModel's ----
        ch = int(channel_mult[0] * model_channels)
        self.input_blocks = nn.ModuleList(
            [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
        )
        self._feature_size = ch
        input_block_chans = [ch]
        ds = 1  # current downsample rate
        for level, mult in enumerate(channel_mult):
            for _ in range(num_res_blocks):
                layers = [
                    ResBlock(
                        ch,
                        time_embed_dim,
                        dropout,
                        out_channels=int(mult * model_channels),
                        dims=dims,
                        use_checkpoint=use_checkpoint,
                        use_scale_shift_norm=use_scale_shift_norm,
                    )
                ]
                ch = int(mult * model_channels)
                if ds in attention_resolutions:
                    layers.append(
                        AttentionBlock(
                            ch,
                            use_checkpoint=use_checkpoint,
                            num_heads=num_heads,
                            num_head_channels=num_head_channels,
                            use_new_attention_order=use_new_attention_order,
                        )
                    )
                self.input_blocks.append(TimestepEmbedSequential(*layers))
                self._feature_size += ch
                input_block_chans.append(ch)
            if level != len(channel_mult) - 1:
                # downsample between levels (except after the last level)
                out_ch = ch
                self.input_blocks.append(
                    TimestepEmbedSequential(
                        ResBlock(
                            ch,
                            time_embed_dim,
                            dropout,
                            out_channels=out_ch,
                            dims=dims,
                            use_checkpoint=use_checkpoint,
                            use_scale_shift_norm=use_scale_shift_norm,
                            down=True,
                        )
                        if resblock_updown
                        else Downsample(
                            ch, conv_resample, dims=dims, out_channels=out_ch
                        )
                    )
                )
                ch = out_ch
                input_block_chans.append(ch)
                ds *= 2
                self._feature_size += ch

        # ---- bottleneck: resblock -> attention -> resblock ----
        self.middle_block = TimestepEmbedSequential(
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
            AttentionBlock(
                ch,
                use_checkpoint=use_checkpoint,
                num_heads=num_heads,
                num_head_channels=num_head_channels,
                use_new_attention_order=use_new_attention_order,
            ),
            ResBlock(
                ch,
                time_embed_dim,
                dropout,
                dims=dims,
                use_checkpoint=use_checkpoint,
                use_scale_shift_norm=use_scale_shift_norm,
            ),
        )
        self._feature_size += ch
        # ---- pooling head: collapses the feature map into an [N x K] output ----
        self.pool = pool
        if pool == "adaptive":
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                nn.AdaptiveAvgPool2d((1, 1)),
                zero_module(conv_nd(dims, ch, out_channels, 1)),
                nn.Flatten(),
            )
        elif pool == "attention":
            assert num_head_channels != -1
            self.out = nn.Sequential(
                normalization(ch),
                nn.SiLU(),
                AttentionPool2d(
                    (image_size // ds), ch, num_head_channels, out_channels
                ),
            )
        elif pool == "spatial":
            # consumes per-block spatial means concatenated in forward()
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                nn.ReLU(),
                nn.Linear(2048, self.out_channels),
            )
        elif pool == "spatial_v2":
            self.out = nn.Sequential(
                nn.Linear(self._feature_size, 2048),
                normalization(2048),
                nn.SiLU(),
                nn.Linear(2048, self.out_channels),
            )
        else:
            raise NotImplementedError(f"Unexpected {pool} pooling")

    def convert_to_fp16(self):
        """
        Convert the torso of the model to float16.
        """
        self.input_blocks.apply(convert_module_to_f16)
        self.middle_block.apply(convert_module_to_f16)

    def convert_to_fp32(self):
        """
        Convert the torso of the model to float32.
        """
        self.input_blocks.apply(convert_module_to_f32)
        self.middle_block.apply(convert_module_to_f32)

    def forward(self, x, timesteps):
        """
        Apply the model to an input batch.

        :param x: an [N x C x ...] Tensor of inputs.
        :param timesteps: a 1-D batch of timesteps.
        :return: an [N x K] Tensor of outputs.
        """
        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))

        results = []
        h = x.type(self.dtype)
        for module in self.input_blocks:
            h = module(h, emb)
            if self.pool.startswith("spatial"):
                # collect per-block spatially-averaged features
                results.append(h.type(x.dtype).mean(dim=(2, 3)))
        h = self.middle_block(h, emb)
        if self.pool.startswith("spatial"):
            results.append(h.type(x.dtype).mean(dim=(2, 3)))
            h = th.cat(results, axis=-1)
            return self.out(h)
        else:
            h = h.type(x.dtype)
            return self.out(h)
highres_final_vis.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from einops import rearrange
4
+
5
+ from voxnerf.render import subpixel_rays_from_img
6
+
7
+ from run_sjc import (
8
+ SJC, ScoreAdapter, StableDiffusion,
9
+ tqdm, EventStorage, HeartBeat, EarlyLoopBreak, get_event_storage, get_heartbeat, optional_load_config, read_stats,
10
+ vis_routine, stitch_vis, latest_ckpt,
11
+ scene_box_filter, render_ray_bundle, as_torch_tsrs,
12
+ device_glb
13
+ )
14
+
15
+
16
+ # the SD deocder is very memory hungry; the latent image cannot be too large
17
+ # for a graphics card with < 12 GB memory, set this to 128; quality already good
18
+ # if your card has 12 to 24 GB memory, you can set this to 200;
19
+ # but visually it won't help beyond a certain point. Our teaser is done with 128.
20
+ decoder_bottleneck_hw = 128
21
+
22
+
23
def final_vis():
    """Render the final high-resolution turntable visualization from the
    latest checkpoint, using the full config written by a prior run."""
    cfg = optional_load_config(fname="full_config.yml")
    assert len(cfg) > 0, "can't find cfg file"
    mod = SJC(**cfg)

    # "family" selects which score-model sub-config to build (e.g. the SD one)
    family = cfg.pop("family")
    model: ScoreAdapter = getattr(mod, family).make()
    vox = mod.vox.make()
    poser = mod.pose.make()

    pbar = tqdm(range(1))

    with EventStorage(), HeartBeat(pbar):
        # restore the trained voxel field from the most recent checkpoint
        ckpt_fname = latest_ckpt()
        state = torch.load(ckpt_fname, map_location="cpu")
        vox.load_state_dict(state)
        vox.to(device_glb)

        with EventStorage("highres"):
            # what dominates the speed is NOT the factor here.
            # you can try from 2 to 8, and the speed is about the same.
            # the dominating factor in the pipeline I believe is the SD decoder.
            evaluate(model, vox, poser, n_frames=200, factor=4)
46
+
47
+
48
@torch.no_grad()
def evaluate(score_model, vox, poser, n_frames=200, factor=4):
    """Render a turntable of the trained voxel field at sub-pixel resolution
    and stitch the frames into movie artifacts.

    :param score_model: the score adapter; when it is a StableDiffusion
        instance the rendered latents are decoded to RGB first.
    :param vox: the trained voxel radiance field.
    :param poser: pose sampler providing H, W, intrinsics K and test poses.
    :param n_frames: number of test poses to sample.
    :param factor: sub-pixel supersampling factor per image axis.
    """
    H, W = poser.H, poser.W
    vox.eval()
    K, poses = poser.sample_test(n_frames)
    del n_frames
    poses = poses[60:]  # skip the full overhead view; not interesting

    fuse = EarlyLoopBreak(5)  # debug hook: cut the loop short when EBREAK is set
    metric = get_event_storage()
    hbeat = get_heartbeat()

    aabb = vox.aabb.T.cpu().numpy()
    vox = vox.to(device_glb)

    num_imgs = len(poses)

    for i in (pbar := tqdm(range(num_imgs))):
        if fuse.on_break():
            break

        pose = poses[i]
        y, depth = highres_render_one_view(vox, aabb, H, W, K, pose, f=factor)
        if isinstance(score_model, StableDiffusion):
            # rendered output lives in latent space; decode to RGB
            y = score_model.decode(y)
        vis_routine(metric, y, depth)

        metric.step()
        hbeat.beat()

    metric.flush_history()

    metric.put_artifact(
        "movie_im_and_depth", ".mp4",
        lambda fn: stitch_vis(fn, read_stats(metric.output_dir, "view")[1])
    )

    metric.put_artifact(
        "movie_im_only", ".mp4",
        lambda fn: stitch_vis(fn, read_stats(metric.output_dir, "img")[1])
    )

    metric.step()
91
+
92
+
93
def highres_render_one_view(vox, aabb, H, W, K, pose, f=4):
    """Render one camera view at f-times sub-pixel resolution, then resize
    the image down to the decoder bottleneck size.

    Returns (rgbs, depth): rgbs is [1 x 4 x hw x hw] — 4 channels,
    presumably the SD latent space; confirm against the score model —
    and depth is [H*f x W*f].
    """
    bs = 4096  # rays per chunk; bounds peak GPU memory

    ro, rd = subpixel_rays_from_img(H, W, K, pose, f=f)
    ro, rd, t_min, t_max = scene_box_filter(ro, rd, aabb)
    n = len(ro)
    ro, rd, t_min, t_max = as_torch_tsrs(vox.device, ro, rd, t_min, t_max)

    rgbs = torch.zeros(n, 4, device=vox.device)
    depth = torch.zeros(n, 1, device=vox.device)

    with torch.no_grad():
        # render the ray bundle in chunks of bs rays
        for i in range(int(np.ceil(n / bs))):
            s = i * bs
            e = min(n, s + bs)
            _rgbs, _depth, _ = render_ray_bundle(
                vox, ro[s:e], rd[s:e], t_min[s:e], t_max[s:e]
            )
            rgbs[s:e] = _rgbs
            depth[s:e] = _depth

    rgbs = rearrange(rgbs, "(h w) c -> 1 c h w", h=H*f, w=W*f)
    depth = rearrange(depth, "(h w) 1 -> h w", h=H*f, w=W*f)
    # supersampled render -> antialiased downsample to the decoder input size
    rgbs = torch.nn.functional.interpolate(
        rgbs, (decoder_bottleneck_hw, decoder_bottleneck_hw),
        mode='bilinear', antialias=True
    )
    return rgbs, depth
121
+
122
+
123
+ if __name__ == "__main__":
124
+ final_vis()
misc.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
def torch_samps_to_imgs(imgs, uncenter=True):
    """Convert torch samples [N, C, H, W] into uint8 numpy images [N, H, W, C].

    When *uncenter* is True the inputs are taken to be in [-1, 1] and are
    first mapped to [0, 1].
    """
    if uncenter:
        imgs = (imgs + 1) / 2  # [-1, 1] -> [0, 1]
    quantized = (imgs * 255).clamp(0, 255).to(torch.uint8)
    channel_last = quantized.permute(0, 2, 3, 1)
    return channel_last.cpu().numpy()
13
+
14
+
15
def imgs_to_torch(imgs):
    """Convert uint8 numpy images (N, H, W, 3) to centered float32 torch
    tensors (N, 3, H', W'), where H' and W' are H, W rounded down to
    multiples of 32."""
    assert imgs.dtype == np.uint8
    assert len(imgs.shape) == 4 and imgs.shape[-1] == 3, "expect (N, H, W, C)"
    _, H, W, _ = imgs.shape

    chans_first = imgs.transpose(0, 3, 1, 2)
    centered = (chans_first / 255).astype(np.float32) * 2 - 1  # [0,255] -> [-1,1]
    tsr = torch.as_tensor(centered)
    H, W = [side - (side % 32) for side in (H, W)]
    return torch.nn.functional.interpolate(tsr, (H, W), mode="bilinear")
27
+
28
+
29
def test_encode_decode():
    """Visual roundtrip check: encode an image into the SD latent space,
    decode it back, and draw the original next to the reconstruction."""
    import imageio
    from run_img_sampling import ScoreAdapter, SD
    from vis import _draw

    # NOTE(review): "~" is likely not expanded by imageio — this probably
    # needs os.path.expanduser; confirm on the machine this is run on.
    fname = "~/clean.png"
    raw = imageio.imread(fname)
    raw = imgs_to_torch(raw[np.newaxis, ...])

    model: ScoreAdapter = SD().run()
    raw = raw.to(model.device)
    zs = model.encode(raw)
    img = model.decode(zs)
    img = torch_samps_to_imgs(img)
    _draw(
        [imageio.imread(fname), img.squeeze(0)],
    )
46
+
47
+
48
def test():
    # entry point for manual testing; currently just the encode/decode roundtrip
    test_encode_decode()
50
+
51
+
52
+ if __name__ == "__main__":
53
+ test()
my/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
a personal toolkit for experiment management;
some of the design patterns are inspired by detectron2
my/__init__.py ADDED
File without changes
my/config.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union
2
+ from copy import deepcopy
3
+ from collections import namedtuple
4
+ from pathlib import Path
5
+ import argparse
6
+ from argparse import RawDescriptionHelpFormatter
7
+ import yaml
8
+ from pydantic import BaseModel as _Base
9
+
10
+
11
class BaseConf(_Base):
    """Base class for config objects; shared pydantic settings."""
    class Config:
        validate_all = True    # also validate default values
        allow_mutation = True  # configs may be modified after construction
        extra = "ignore"       # silently drop unknown keys instead of erroring
16
+
17
+
18
def SingleOrList(inner_type):
    """Typing shorthand: a single *inner_type* value or a list of them."""
    listed = List[inner_type]
    return Union[inner_type, listed]
20
+
21
+
22
def optional_load_config(fname="config.yml"):
    """Load *fname* from the current working directory if it exists.

    Returns the parsed YAML mapping, or an empty dict when the file is
    absent. The raw YAML is echoed to stdout on success.
    """
    conf_fname = Path.cwd() / fname
    if not conf_fname.is_file():
        return {}
    raw = conf_fname.read_text()
    print("loaded config\n ")
    print(raw)  # yaml raw itself is well formatted
    return yaml.safe_load(raw)
32
+
33
+
34
def write_full_config(cfg_obj, fname="full_config.yml"):
    """Serialize a pydantic config object to YAML, echo it to stdout, and
    write it to *fname* in the current working directory."""
    cfg = cfg_obj.dict()
    cfg = _dict_to_yaml(cfg)
    print(f"\n--- full config ---\n\n{cfg}\n")
    with (Path.cwd() / fname).open("w") as f:
        f.write(cfg)
40
+
41
+
42
def argparse_cfg_template(curr_cfgs):
    """Overlay command-line ``--key value`` pairs onto *curr_cfgs*.

    All unrecognized argv entries are consumed in pairs; each pair becomes a
    ConfigMaker clause applied in order. Returns the resulting config dict.
    """
    parser = argparse.ArgumentParser(
        description='Manual spec of configs',
        epilog=f'curr cfgs:\n\n{_dict_to_yaml(curr_cfgs)}',
        formatter_class=RawDescriptionHelpFormatter
    )
    _, args = parser.parse_known_args()
    clauses = []
    # NOTE(review): an odd number of leftover args raises IndexError at
    # args[i+1] — a flag given without a value is not handled gracefully.
    for i in range(0, len(args), 2):
        assert args[i][:2] == "--", "please start args with --"
        clauses.append({args[i][2:]: args[i+1]})
    print(f"cmdline clauses: {clauses}")

    maker = ConfigMaker(curr_cfgs)
    for clu in clauses:
        maker.execute_clause(clu)

    final = maker.state.copy()
    return final
61
+
62
+
63
def _dict_to_yaml(arg):
    """Serialize *arg* to a YAML string, keeping the original key order."""
    dumped = yaml.safe_dump(arg, sort_keys=False, allow_unicode=True)
    return dumped
65
+
66
+
67
def dispatch(module):
    """Build *module* (a BaseConf subclass) from config.yml defaults plus
    command-line overrides, persist the merged config, and run it."""
    cfg = optional_load_config()
    cfg = module(**cfg).dict()  # validate and fill defaults via pydantic

    cfg = argparse_cfg_template(cfg)  # cmdline takes priority
    mod = module(**cfg)

    write_full_config(mod)

    mod.run()
77
+
78
+
79
+ # below are some support tools
80
+
81
+
82
class ConfigMaker():
    """Applies a sequence of "clauses" (from the command line or yaml) onto a
    nested config node. A clause names a dotted path, a verb, and objects."""

    CMD = namedtuple('cmd', field_names=['sub', 'verb', 'objs'])
    VERBS = ('add', 'replace', 'del')

    def __init__(self, base_node):
        self.state = base_node
        self.clauses = []

    def clone(self):
        """An independent deep copy of this maker."""
        return deepcopy(self)

    def execute_clause(self, raw_clause):
        cls = self.__class__
        assert isinstance(raw_clause, (str, dict))
        if isinstance(raw_clause, dict):
            assert len(raw_clause) == 1, \
                "a clause can only have 1 statement: {} clauses in {}".format(
                    len(raw_clause), raw_clause
                )
            stmt, arg = next(iter(raw_clause.items()))
        else:
            stmt, arg = raw_clause, None
        parsed = self.parse_clause_cmd(stmt)
        # NOTE(review): NodeTracer as defined in this file only implements
        # `replace`; `add`/`del` clauses would hit a missing method — confirm.
        tracer = NodeTracer(self.state)
        tracer.advance_pointer(path=parsed.sub)
        if parsed.verb == cls.VERBS[0]:
            tracer.add(parsed.objs, arg)
        elif parsed.verb == cls.VERBS[1]:
            tracer.replace(parsed.objs, arg)
        elif parsed.verb == cls.VERBS[2]:
            assert isinstance(raw_clause, str)
            tracer.delete(parsed.objs)
        self.state = tracer.state

    @classmethod
    def parse_clause_cmd(cls, input):
        """
        Parse a clause string into (sub, verb, objs).

        A missing verb means "replace" with the whole string as subject —
        a syntactical sugar that makes writing configs easy. Expected parses:
            input            sub     verb      obj
            --- No verb
            ''               ''      replace   []
            'a.b'            'a.b'   replace   []
            'add'            ''      add       []
            'P Q'            err: 2 subjects
            --- Verb present
            'T add'          'T'     add       []
            'T del a b'      'T'     del       [a, b]
            'P Q add a'      err: 2 subjects
            'P add del b'    err: 2 verbs
        """
        assert isinstance(input, str)
        tokens = input.split()
        verb, verb_inx = cls.scan_for_verb(tokens)
        if verb is None:
            assert len(tokens) <= 1, "no verb present; more than 1 subject: {}"\
                .format(tokens)
            sub = tokens[0] if len(tokens) == 1 else ''
            verb = cls.VERBS[1]
            objs = []
        else:
            assert not verb_inx > 1, 'verb {} at inx {}; more than 1 subject in: {}'\
                .format(verb, verb_inx, tokens)
            sub = tokens[0] if verb_inx == 1 else ''
            objs = tokens[verb_inx + 1:]
        return cls.CMD(sub=sub, verb=verb, objs=objs)

    @classmethod
    def scan_for_verb(cls, input_list):
        """Return (verb, index) for the single verb in *input_list*, or
        (None, -1) when no verb is present; errors on multiple verbs."""
        assert isinstance(input_list, list)
        counts = [input_list.count(v) for v in cls.VERBS]
        presence = [cnt > 0 for cnt in counts]
        n_present = sum(presence)
        if n_present == 0:
            return None, -1
        if n_present > 1:
            raise ValueError("multiple verbs discovered in {}".format(input_list))
        if max(counts) > 1:
            raise ValueError("verbs repeated in cmd: {}".format(input_list))
        # by now there is exactly 1 verb occurring exactly 1 time
        verb = cls.VERBS[presence.index(1)]
        return verb, input_list.index(verb)
175
+
176
+
177
class NodeTracer():
    """Walks a dotted path into a nested list/dict structure, keeping a
    (parent, child_token) pair so the pointed node can be reassigned."""

    def __init__(self, src_node):
        """
        A src node can be either a list or dict
        """
        assert isinstance(src_node, (list, dict))

        # these are movable pointers
        self.child_token = "_"  # init token can be anything
        # wrap the root in a synthetic parent so the root itself is replaceable
        self.parent = {self.child_token: src_node}

        # these are permanent pointers at the root
        self.root_child_token = self.child_token
        self.root = self.parent

    @property
    def state(self):
        # the (possibly mutated) root node
        return self.root[self.root_child_token]

    @property
    def pointed(self):
        # node currently pointed at; raises KeyError/IndexError when the
        # child token is absent from the parent container
        return self.parent[self.child_token]

    def advance_pointer(self, path):
        """Move the pointer down *path* ('a.b.0' style; numeric segments are
        list indices). Raises ValueError on a missing segment."""
        if len(path) == 0:
            return
        path_list = list(
            map(lambda x: int(x) if str.isdigit(x) else x, path.split('.'))
        )

        for i, token in enumerate(path_list):
            self.parent = self.pointed
            self.child_token = token
            try:
                # touch the property purely to validate the token exists
                self.pointed
            except (IndexError, KeyError):
                raise ValueError(
                    "During the tracing of {}, {}-th token '{}'"
                    " is not present in node {}".format(
                        path, i, self.child_token, self.state
                    )
                )

    def replace(self, objs, arg):
        """Replace the pointed value with *arg*, coerced to the old value's type."""
        assert len(objs) == 0
        val_type = type(self.parent[self.child_token])
        # this is such an unfortunate hack
        # turn everything to string, so that eval could work
        # some of the clauses come from cmdline, some from yaml files for sow.
        # NOTE(review): eval on cmdline/yaml text — acceptable for a personal
        # tool, but never feed it untrusted input.
        arg = str(arg)
        if val_type == str:
            pass
        else:
            arg = eval(arg)
            assert type(arg) == val_type, \
                f"require {val_type.__name__}, given {type(arg).__name__}"

        self.parent[self.child_token] = arg
my/registry.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from detectron2
2
+ from typing import Any, Dict, Iterable, Iterator, Tuple
3
+ from tabulate import tabulate
4
+
5
+
6
class Registry(Iterable[Tuple[str, Any]]):
    """A name -> object mapping; supports registration via direct call or,
    when called with no argument, as a decorator. Keys on ``__name__``."""

    def __init__(self, name: str) -> None:
        """
        Args:
            name (str): the name of this registry
        """
        self._name: str = name
        self._obj_map: Dict[str, Any] = {}

    def _do_register(self, name: str, obj: Any) -> None:
        # duplicate registration is a programming error
        assert (
            name not in self._obj_map
        ), "An object named '{}' was already registered in '{}' registry!".format(
            name, self._name
        )
        self._obj_map[name] = obj

    def register(self, obj: Any = None) -> Any:
        """
        Register the given object under the the name `obj.__name__`.
        Can be used as either a decorator or not. See docstring of this class for usage.
        """
        if obj is not None:
            # plain function-call usage
            self._do_register(obj.__name__, obj)
            return None

        # decorator usage
        def deco(func_or_class: Any) -> Any:
            self._do_register(func_or_class.__name__, func_or_class)
            return func_or_class

        return deco

    def get(self, name: str) -> Any:
        found = self._obj_map.get(name)
        if found is None:
            raise KeyError(
                "No object named '{}' found in '{}' registry!".format(name, self._name)
            )
        return found

    def __contains__(self, name: str) -> bool:
        return name in self._obj_map

    def __repr__(self) -> str:
        table_headers = ["Names", "Objects"]
        body = tabulate(
            self._obj_map.items(), headers=table_headers, tablefmt="fancy_grid"
        )
        return "Registry of {}:\n".format(self._name) + body

    def __iter__(self) -> Iterator[Tuple[str, Any]]:
        return iter(self._obj_map.items())

    __str__ = __repr__
my/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .event import EventStorage, get_event_storage, read_stats
2
+ from .tqdm import tqdm
3
+ from .heartbeat import HeartBeat, get_heartbeat
4
+ from .debug import EarlyLoopBreak
my/utils/debug.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
class EarlyLoopBreak():
    """Debug aid: when the EBREAK env var is set, `on_break` returns True
    after `break_at` calls so long loops can be cut short."""

    def __init__(self, break_at: int):
        self.iter = 0
        self.break_at = break_at
        # active only when EBREAK is set to a non-empty value
        self.on = bool(os.environ.get("EBREAK"))

    def on_break(self):
        if not self.on:
            return
        self.iter += 1
        armed = self.break_at > 0
        if armed and self.iter >= self.break_at:
            return True
my/utils/event.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # design inspiration from detectron2
2
+ from pathlib import Path
3
+ import json
4
+ import os
5
+ from contextlib import contextmanager
6
+ from .ticker import IntervalTicker
7
+
8
+
9
+ _CURRENT_STORAGE_STACK = []
10
+
11
+
12
def get_event_storage():
    """Return the innermost active :class:`EventStorage`.

    Asserts when called outside a ``with EventStorage(...)`` context.
    """
    assert _CURRENT_STORAGE_STACK, (
        "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
    )
    return _CURRENT_STORAGE_STACK[-1]
22
+
23
+
24
def read_lined_json(fname):
    """Lazily yield one decoded object per line of a JSON-lines file."""
    with Path(fname).open('r') as fh:
        for raw_line in fh:
            yield json.loads(raw_line)
29
+
30
+
31
def read_stats(dirname, key):
    """Collect (iters, values) for *key* from dirname/history.json.

    Returns a pair of empty lists when dirname is None or the history file
    does not exist.
    """
    if dirname is None or not (fname := Path(dirname) / "history.json").is_file():
        return [], []
    entries = [e for e in read_lined_json(fname) if key in e]
    xs = [e['iter'] for e in entries]
    ys = [e[key] for e in entries]
    return xs, ys
39
+
40
+
41
class EventStorage():
    """Per-iteration scalar/artifact logger (design inspired by detectron2).

    Values are buffered per iteration and appended to
    ``<output_dir>/history.json`` as JSON lines.  Writing only happens when
    the storage is entered as a context manager with a non-None output_dir;
    nested storages redirect their output into a subdirectory of the parent.
    Flushes are rate-limited by an :class:`IntervalTicker`.
    """

    def __init__(self, output_dir="./", start_iter=0, flush_period=60):
        self.iter = start_iter
        self.ticker = IntervalTicker(flush_period)
        self.history = []  # committed but not-yet-flushed per-iteration dicts
        self._current_prefix = ""
        self._init_curr_buffer_()

        self.output_dir = output_dir
        self.writable = False  # set True in __enter__ when output_dir is usable

    def _open(self):
        """Open history.json for appending (no-op unless writable)."""
        if self.writable:
            output_dir = Path(self.output_dir)
            if not output_dir.is_dir():
                output_dir.mkdir(parents=True, exist_ok=True)
            json_fname = output_dir / 'history.json'

            self._file_handle = json_fname.open('a', encoding='utf8')
            self.output_dir = output_dir  # make sure it's a path object

    def _init_curr_buffer_(self):
        # fresh buffer for the current iteration; always carries 'iter'
        self.curr_buffer = {'iter': self.iter}

    def step(self, flush=False):
        """Commit the current buffer, maybe flush, then begin iteration+1."""
        self.history.append(self.curr_buffer)

        on_flush_period = self.ticker.tick()
        if flush or on_flush_period:
            self.flush_history()

        self.iter += 1
        self._init_curr_buffer_()

    def flush_history(self):
        """Write buffered entries as JSON lines; always clears the buffer."""
        if self.writable:
            for item in self.history:
                line = json.dumps(item, sort_keys=True, ensure_ascii=False) + "\n"
                self._file_handle.write(line)
            self._file_handle.flush()
        self.history = []

    def full_key(self, key):
        """Return *key* with the current prefix prepended."""
        assert isinstance(key, str)
        name = self._current_prefix + key
        return name

    def put(self, key, val):
        """Record one value for this iteration (floats rounded to 3 dp)."""
        key = self.full_key(key)
        assert isinstance(val, (int, float, str))
        if isinstance(val, float):
            val = round(val, 3)
        self.curr_buffer[key] = val

    def put_scalars(self, **kwargs):
        """Record several key=value scalars at once."""
        for k, v in kwargs.items():
            self.put(k, v)

    def put_artifact(self, key, ext, save_func):
        """Save an artifact via ``save_func(fname)`` and log its path.

        Returns the filename, or None when the storage is not writable.
        """
        if not self.writable:
            return
        os.makedirs(self.output_dir / key, exist_ok=True)
        fname = (self.output_dir / key / f"step_{self.iter}").with_suffix(ext)
        fname = str(fname)

        # must be called inside so that
        # 1. the func is not executed if the metric is not writable
        # 2. the key is only inserted if the func succeeds
        save_func(fname)
        self.put(key, fname)
        return fname

    def close(self):
        """Flush remaining entries and close the file handle."""
        self.flush_history()
        if self.writable:
            self._file_handle.close()

    def get_last(self):
        """Return the most recently committed buffer, or None if empty."""
        if len(self.history) > 0:
            last = self.history[-1]
            return last

    def __enter__(self):
        # when nested inside another storage, write into a subdirectory of
        # the parent and record the child dir path in the parent's buffer
        if len(_CURRENT_STORAGE_STACK) > 0:
            parent = _CURRENT_STORAGE_STACK[-1]
            root, dirname = parent.output_dir, self.output_dir
            if root is not None and dirname is not None:
                child_dir = parent.output_dir / f"{self.output_dir}_{parent.iter}"
                self.output_dir = child_dir
                parent.put(str(dirname), str(child_dir))

        if self.output_dir is not None:
            self.writable = True
            self._open()

        _CURRENT_STORAGE_STACK.append(self)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        assert _CURRENT_STORAGE_STACK[-1] == self
        _CURRENT_STORAGE_STACK.pop()
        self.close()
my/utils/heartbeat.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # generates periodic heartbeats for remote experiment monitoring
2
+ from pathlib import Path
3
+ import json
4
+ from inspect import stack
5
+ from .ticker import IntervalTicker
6
+
7
+ _CURRENT_BEAT_STACK = []
8
+
9
+
10
def get_heartbeat():
    """
    Returns:
        The :class:`HeartBeat` object that's currently being used.
    Throws an error if no :class:`HeartBeat` is currently enabled.
    """
    # NOTE: the docstring/assert previously said EventStorage — a copy-paste
    # slip from event.py; this function manages the HeartBeat stack.
    assert len(
        _CURRENT_BEAT_STACK
    ), "get_heartbeat() has to be called inside a 'with HeartBeat(...)' context!"
    return _CURRENT_BEAT_STACK[-1]
20
+
21
+
22
def get_tqdm_meter(pbar, format_dict):
    """Render the stats portion of a tqdm meter line.

    Args:
        pbar: a tqdm progress bar (anything with ``format_meter(**kwargs)``).
        format_dict: tqdm's ``format_dict``; NOT mutated (the original
            wrote 'bar_format' into the caller's dict as a side effect).

    Returns:
        The "{r_bar}" portion of the meter with the leading "| " stripped.
    """
    fmt = dict(format_dict)  # work on a copy; keep the caller's dict intact
    fmt['bar_format'] = "{r_bar}"
    meter_str = pbar.format_meter(**fmt)
    return meter_str[2:]
27
+
28
+
29
def caller_info(n_stack_up):
    """Describe the frame *n_stack_up* levels above the caller.

    Returns a "filename:lineno - function" string.
    """
    frame = stack()[1 + n_stack_up]  # 1 up as base so that it starts from caller
    return f"{frame.filename}:{frame.lineno} - {frame.function}"
33
+
34
+
35
class HeartBeat():
    """Periodically writes a small JSON status file so a remote monitor can
    tell whether a long-running experiment is still alive.

    Wraps a tqdm progress bar; writes are rate-limited by an
    :class:`IntervalTicker`.
    """

    def __init__(
        self, pbar, write_interval=10,
        output_dir="./", fname="heartbeat.json"
    ):
        self.pbar = pbar  # tqdm bar whose meter/elapsed time is reported
        self.fname = Path(output_dir) / fname
        self.ticker = IntervalTicker(write_interval)
        self.completed = False

        # force one write at the beginning
        self.beat(force_write=True, n_stack_up=2)

    def beat(self, force_write=False, n_stack_up=1):
        """Write the status file if forced or the write interval elapsed.

        n_stack_up: how many frames above this call to report as 'caller'.
        """
        on_write_period = self.ticker.tick()
        if force_write or on_write_period:
            stats = self.stats()
            stats['caller'] = caller_info(n_stack_up)

            with open(self.fname, "w") as f:
                json.dump(stats, f)

    def done(self):
        """Mark the run complete and force a final status write."""
        self.completed = True
        self.beat(force_write=True, n_stack_up=2)

    def stats(self):
        """Snapshot dict: last beat time, done flag, tqdm meter, elapsed secs."""
        pbar = self.pbar
        fdict = pbar.format_dict
        stats = {
            "beat": self.ticker.tick_str(),
            "done": self.completed,
            "meter": get_tqdm_meter(pbar, fdict),
            "elapsed": int(fdict['elapsed'])
        }
        return stats

    def __enter__(self):
        _CURRENT_BEAT_STACK.append(self)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        assert _CURRENT_BEAT_STACK[-1] == self
        _CURRENT_BEAT_STACK.pop()
my/utils/plot.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+
5
def mpl_fig_to_buffer(fig):
    """Rasterize a matplotlib figure into an RGBA numpy array, then close it."""
    fig.canvas.draw()
    pixels = np.array(fig.canvas.renderer.buffer_rgba())
    plt.close(fig)
    return pixels
my/utils/seed.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # from pytorch lightning
2
+ import random
3
+ import numpy as np
4
+ import torch
5
+
6
max_seed_value = np.iinfo(np.uint32).max
min_seed_value = np.iinfo(np.uint32).min


def seed_everything(seed=None):
    """Seed python's `random`, numpy and torch (incl. CUDA) RNGs.

    Args:
        seed: anything int()-convertible.  When None, a valid seed is drawn
            at random (the original crashed with TypeError on the default).

    Returns:
        The integer seed actually used.

    Raises:
        ValueError: if the seed is outside numpy's uint32 range.
    """
    if seed is None:
        seed = random.randint(min_seed_value, max_seed_value)
    seed = int(seed)

    if not (min_seed_value <= seed <= max_seed_value):
        raise ValueError(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")

    print(f"seed set to {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed
my/utils/ticker.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datetime import date, time, datetime, timedelta
2
+ from time import sleep
3
+
4
+
5
class IntervalTicker():
    """Rate limiter: tick() reports True at most once per interval."""

    def __init__(self, interval=60):
        # interval is in seconds
        self.interval = timedelta(seconds=interval)
        self.last_tick = datetime.now()
        self.now = self.last_tick

    def tick(self):
        """Return True (and reset the timer) iff the interval has elapsed.

        The original implicitly returned None otherwise; an explicit False
        is truthiness-compatible for all callers.
        """
        self.now = datetime.now()
        if (self.now - self.last_tick) > self.interval:
            self.last_tick = self.now
            return True
        return False

    def tick_str(self):
        """ISO-8601 timestamp (second precision) of the last tick() call."""
        return self.now.isoformat(timespec='seconds')
my/utils/tqdm.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm as orig_tqdm
3
+
4
+
5
def tqdm(*args, **kwargs):
    """tqdm wrapper that silences bar output when IS_REMOTE is set.

    The devnull handle is cached on the function so repeated calls do not
    leak one file descriptor per progress bar (the original opened a fresh
    handle on every call and never closed it).  An explicitly supplied
    ``file`` kwarg is respected.
    """
    is_remote = bool(os.environ.get("IS_REMOTE", False))
    if is_remote:
        sink = getattr(tqdm, "_devnull", None)
        if sink is None or sink.closed:
            sink = open(os.devnull, "w")
            tqdm._devnull = sink
        kwargs.setdefault("file", sink)
    return orig_tqdm(*args, **kwargs)
my3d.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # some tools developed for the vision class
2
+ import numpy as np
3
+ from numpy import cross, tan
4
+ from numpy.linalg import norm, inv
5
+
6
+
7
def normalize(v):
    """Return *v* scaled to unit Euclidean length."""
    length = norm(v)
    return v / length
9
+
10
+
11
def camera_pose(eye, front, up):
    """Build the 4x4 camera-to-world pose from eye position and view vectors.

    OpenGL convention: the camera looks along -z, so z = -front; x and y are
    completed via cross products to form a right-handed orthonormal frame.
    """
    z_axis = normalize(-1 * front)
    x_axis = normalize(cross(up, z_axis))
    y_axis = normalize(cross(z_axis, x_axis))

    # stack the axes and the eye as the columns of [R | t; 0 1]
    columns = [a.reshape(-1, 1) for a in (x_axis, y_axis, z_axis, eye)]
    pose = np.block([
        columns,
        [0, 0, 0, 1]
    ])
    return pose
27
+
28
+
29
def compute_extrinsics(eye, front, up):
    """World-to-camera transform: the inverse of the camera pose."""
    return inv(camera_pose(eye, front, up))
33
+
34
+
35
def compute_intrinsics(aspect_ratio, fov, img_height_in_pix):
    """Compose the projection (partial NDC) with the NDC-to-pixel mapping.

    aspect_ratio is width / height.  Points landing outside [-1, 1] in NDC
    should be discarded by the caller; no z-clipping is performed here.
    """
    proj = compute_proj_to_normalized(aspect_ratio, fov)
    to_pixels = compute_normalized_to_img_trans(aspect_ratio, img_height_in_pix)
    return to_pixels @ proj
45
+
46
+
47
def compute_proj_to_normalized(aspect, fov):
    """Projection matrix from camera space to (partial) NDC.

    Compared to the standard OpenGL NDC intrinsic this skips the 3rd-row
    treatment of z; the -1 row simply carries the negative depth so the
    perspective divide can happen later.  fov is in degrees.
    """
    half_fov_rad = (fov / 180 * np.pi) / 2
    t = tan(half_fov_rad)  # tan of half the field of view
    return np.array([
        [1 / (t * aspect), 0, 0, 0],
        [0, 1 / t, 0, 0],
        [0, 0, -1, 0]  # copy the negative distance for division
    ])
58
+
59
+
60
def compute_normalized_to_img_trans(aspect, img_height_in_pix):
    """Map NDC coords in [-1, 1] to pixel coordinates with y pointing down.

    Follows the OpenGL convention that (0, 0) sits at the CENTER of a pixel,
    hence the -0.5 translation (useful when shooting rays through pixels);
    the y flip uses img_h - 1 for the same reason.
    """
    img_h = img_height_in_pix
    img_w = aspect * img_height_in_pix

    # scale + translate, y still pointing up
    ndc_to_img = np.array([
        [img_w / 2, 0, img_w / 2 - 0.5],
        [0, img_h / 2, img_h / 2 - 0.5],
        [0, 0, 1]
    ])

    # flip y so the row index grows downward; equivalent to negating the
    # (1, 1) entry of ndc_to_img directly
    flip_y = np.array([
        [1, 0, 0],
        [0, -1, img_h - 1],
        [0, 0, 1]
    ])

    return flip_y @ ndc_to_img
90
+
91
+
92
def unproject(K, pixel_coords, depth=1.0):
    """Back-project pixels to camera-space points (a.k.a. backprojection).

    pixel_coords: [n, 2] pixel locations.
    depth: [n,] or scalar, broadcastable against the pixel coords.
    Returns homogeneous [n, 4] points.
    """
    K = K[0:3, 0:3]

    # [2+1, n] so the matrix multiply sits on the left
    homog = as_homogeneous(pixel_coords).T

    # rays come out with z = -1 — exactly right, since the camera
    # faces the -z axis
    rays = inv(K) @ homog

    scaled = rays * depth  # [3, n] * [n,] broadcast
    pts = scaled.T
    return as_homogeneous(pts)
110
+
111
+
112
+ """
113
+ these two functions are changed so that they can handle arbitrary number of
114
+ dimensions >=1
115
+ """
116
+
117
+
118
def homogenize(pts):
    """Divide by the last coordinate; handles any leading shape [..., d]."""
    *lead, _ = pts.shape
    divisor = pts[..., -1].reshape(*lead, 1)
    return pts / divisor
123
+
124
+
125
def as_homogeneous(pts, lib=np):
    """Append a 1-valued coordinate along the last axis of [..., d]."""
    *lead, d = pts.shape
    out = lib.ones((*lead, d + 1))
    out[..., :d] = pts
    return out
131
+
132
+
133
def simple_point_render(pts, img_w, img_h, fov, eye, front, up):
    """Rasterize world points onto a white canvas as red dots.

    pts: [N, 3] world-space points.
    Returns an [img_h, img_w, 3] float canvas.
    """
    canvas = np.ones((img_h, img_w, 3))

    world_pts = as_homogeneous(pts)

    aspect = img_w / img_h
    E = compute_extrinsics(eye, front, up)
    world_2_ndc = compute_proj_to_normalized(aspect, fov)
    ndc_to_img = compute_normalized_to_img_trans(aspect, img_h)

    ndc = homogenize(world_pts @ E.T @ world_2_ndc.T)

    # drop everything outside the [-1, 1] view square
    keep = ~(np.abs(ndc) > 1.0).any(axis=1)
    ndc = ndc[keep]

    img_pts = ndc @ ndc_to_img.T

    # paint each surviving point
    img_pts = np.rint(img_pts).astype(np.int32)
    xs, ys, _ = img_pts.T
    canvas[ys, xs] = (1, 0, 0)

    return canvas
ncsn/__init__.py ADDED
File without changes
ncsn/bedroom.yml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ training:
2
+ batch_size: 128
3
+ n_epochs: 500000
4
+ n_iters: 150001
5
+ snapshot_freq: 5000
6
+ snapshot_sampling: true
7
+ anneal_power: 2
8
+ log_all_sigmas: false
9
+
10
+ sampling:
11
+ batch_size: 36
12
+ data_init: false
13
+ step_lr: 0.0000018
14
+ n_steps_each: 3
15
+ ckpt_id: 150000
16
+ final_only: true
17
+ fid: false
18
+ denoise: true
19
+ num_samples4fid: 10000
20
+ inpainting: false
21
+ interpolation: false
22
+ n_interpolations: 10
23
+
24
+ fast_fid:
25
+ batch_size: 1000
26
+ num_samples: 1000
27
+ step_lr: 0.0000018
28
+ n_steps_each: 3
29
+ begin_ckpt: 100000
30
+ end_ckpt: 150000
31
+ verbose: false
32
+ ensemble: false
33
+
34
+ test:
35
+ begin_ckpt: 5000
36
+ end_ckpt: 150000
37
+ batch_size: 100
38
+
39
+ data:
40
+ dataset: "LSUN"
41
+ category: "bedroom"
42
+ image_size: 128
43
+ channels: 3
44
+ logit_transform: false
45
+ uniform_dequantization: false
46
+ gaussian_dequantization: false
47
+ random_flip: true
48
+ rescaled: false
49
+ num_workers: 32
50
+
51
+ model:
52
+ sigma_begin: 190
53
+ num_classes: 1086
54
+ ema: true
55
+ ema_rate: 0.999
56
+ spec_norm: false
57
+ sigma_dist: geometric
58
+ sigma_end: 0.01
59
+ normalization: InstanceNorm++
60
+ nonlinearity: elu
61
+ ngf: 128
62
+
63
+ optim:
64
+ weight_decay: 0.000
65
+ optimizer: "Adam"
66
+ lr: 0.0001
67
+ beta1: 0.9
68
+ amsgrad: false
69
+ eps: 0.00000001
ncsn/ema.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import torch.nn as nn
3
+
4
class EMAHelper(object):
    """Exponential moving average of a module's parameters.

    After each update: shadow = mu * shadow + (1 - mu) * param.
    """

    def __init__(self, mu=0.999):
        self.mu = mu  # decay rate; closer to 1 means a slower-moving average
        self.shadow = {}  # parameter name -> averaged tensor

    def register(self, module):
        """Initialize the shadow copies from the module's current params."""
        if isinstance(module, nn.DataParallel):
            module = module.module
        for name, param in module.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self, module):
        """Fold the module's current params into the moving average."""
        if isinstance(module, nn.DataParallel):
            module = module.module
        for name, param in module.named_parameters():
            if param.requires_grad:
                self.shadow[name].data = (1. - self.mu) * param.data + self.mu * self.shadow[name].data

    def ema(self, module):
        """Overwrite the module's params with the averaged values, in place."""
        if isinstance(module, nn.DataParallel):
            module = module.module
        for name, param in module.named_parameters():
            if param.requires_grad:
                param.data.copy_(self.shadow[name].data)

    def ema_copy(self, module):
        """Return a fresh copy of *module* whose params are the EMA values.

        Rebuilds the module via ``type(module)(module.config)`` — assumes the
        module's __init__ takes its config object (true for the NCSN models
        here; TODO confirm for any new model type).
        """
        if isinstance(module, nn.DataParallel):
            inner_module = module.module
            module_copy = type(inner_module)(inner_module.config).to(inner_module.config.device)
            module_copy.load_state_dict(inner_module.state_dict())
            module_copy = nn.DataParallel(module_copy)
        else:
            module_copy = type(module)(module.config).to(module.config.device)
            module_copy.load_state_dict(module.state_dict())
        # module_copy = copy.deepcopy(module)
        self.ema(module_copy)
        return module_copy

    def state_dict(self):
        """The shadow dict, suitable for checkpointing."""
        return self.shadow

    def load_state_dict(self, state_dict):
        """Restore the shadow dict from a checkpoint."""
        self.shadow = state_dict
ncsn/layers.py ADDED
@@ -0,0 +1,456 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ from torch.nn.parameter import Parameter
4
+ import torch.nn.functional as F
5
+ from .normalization import *
6
+ from functools import partial
7
+ import math
8
+ import torch.nn.init as init
9
+
10
+
11
def get_act(config):
    """Return the activation selected by ``config.model.nonlinearity``."""
    name = config.model.nonlinearity.lower()
    if name == 'elu':
        return nn.ELU()
    if name == 'relu':
        return nn.ReLU()
    if name == 'lrelu':
        return nn.LeakyReLU(negative_slope=0.2)
    if name == 'swish':
        def swish(x):
            return x * torch.sigmoid(x)
        return swish
    raise NotImplementedError('activation function does not exist!')
24
+
25
def spectral_norm(layer, n_iters=1):
    """Wrap *layer* with spectral normalization (n_iters power iterations)."""
    return torch.nn.utils.spectral_norm(layer, n_power_iterations=n_iters)
27
+
28
def conv1x1(in_planes, out_planes, stride=1, bias=True, spec_norm=False):
    "1x1 convolution"
    layer = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
                      padding=0, bias=bias)
    return spectral_norm(layer) if spec_norm else layer
35
+
36
+
37
def conv3x3(in_planes, out_planes, stride=1, bias=True, spec_norm=False):
    "3x3 convolution with padding"
    layer = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                      padding=1, bias=bias)
    return spectral_norm(layer) if spec_norm else layer
45
+
46
+
47
def stride_conv3x3(in_planes, out_planes, kernel_size, bias=True, spec_norm=False):
    """Stride-2 convolution with 'same'-style padding (kernel_size // 2)."""
    layer = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=2,
                      padding=kernel_size // 2, bias=bias)
    return spectral_norm(layer) if spec_norm else layer
53
+
54
+
55
def dilated_conv3x3(in_planes, out_planes, dilation, bias=True, spec_norm=False):
    """3x3 convolution whose padding equals its dilation (size-preserving)."""
    layer = nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=dilation,
                      dilation=dilation, bias=bias)
    return spectral_norm(layer) if spec_norm else layer
61
+
62
class CRPBlock(nn.Module):
    """Chained residual pooling: n_stages of (pool -> 3x3 conv), each added
    back onto the running activation."""

    def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True, spec_norm=False):
        super().__init__()
        self.convs = nn.ModuleList(
            conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm)
            for _ in range(n_stages)
        )
        self.n_stages = n_stages
        # despite the attribute name this is an average pool when maxpool=False
        pool_cls = nn.MaxPool2d if maxpool else nn.AvgPool2d
        self.maxpool = pool_cls(kernel_size=5, stride=1, padding=2)
        self.act = act

    def forward(self, x):
        x = self.act(x)
        path = x
        for conv in self.convs:
            path = conv(self.maxpool(path))
            x = x + path
        return x
84
+
85
+
86
class CondCRPBlock(nn.Module):
    """Conditional CRP block: like CRPBlock but each stage is preceded by a
    class-conditional normalization layer (and pooling is always average)."""

    def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU(), spec_norm=False):
        super().__init__()
        self.convs = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.normalizer = normalizer
        for i in range(n_stages):
            self.norms.append(normalizer(features, num_classes, bias=True))
            self.convs.append(conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm))

        self.n_stages = n_stages
        # note: named maxpool but is an average pool in the conditional variant
        self.maxpool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
        self.act = act

    def forward(self, x, y):
        """y: per-sample class index consumed by the conditional norms."""
        x = self.act(x)
        path = x
        for i in range(self.n_stages):
            path = self.norms[i](path, y)
            path = self.maxpool(path)
            path = self.convs[i](path)

            x = path + x
        return x
110
+
111
+
112
class RCUBlock(nn.Module):
    """Residual conv units: n_blocks of [n_stages x (act -> conv)] with a
    skip connection around each block."""

    def __init__(self, features, n_blocks, n_stages, act=nn.ReLU(), spec_norm=False):
        super().__init__()

        # sub-convs are registered as attributes named '{block}_{stage}_conv'
        for b in range(1, n_blocks + 1):
            for s in range(1, n_stages + 1):
                setattr(self, '{}_{}_conv'.format(b, s),
                        conv3x3(features, features, stride=1, bias=False,
                                spec_norm=spec_norm))

        self.stride = 1
        self.n_blocks = n_blocks
        self.n_stages = n_stages
        self.act = act

    def forward(self, x):
        for b in range(1, self.n_blocks + 1):
            residual = x
            for s in range(1, self.n_stages + 1):
                x = getattr(self, '{}_{}_conv'.format(b, s))(self.act(x))
            x += residual
        return x
135
+
136
+
137
class CondRCUBlock(nn.Module):
    """Conditional RCU: each (act -> conv) stage is preceded by a
    class-conditional normalization layer."""

    def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU(), spec_norm=False):
        super().__init__()

        # sub-layers are registered as attributes '{block}_{stage}_norm/conv'
        for i in range(n_blocks):
            for j in range(n_stages):
                setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
                setattr(self, '{}_{}_conv'.format(i + 1, j + 1),
                        conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm))

        self.stride = 1
        self.n_blocks = n_blocks
        self.n_stages = n_stages
        self.act = act
        self.normalizer = normalizer

    def forward(self, x, y):
        """y: class index driving the conditional normalization."""
        for i in range(self.n_blocks):
            residual = x
            for j in range(self.n_stages):
                x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
                x = self.act(x)
                x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)

            x += residual
        return x
163
+
164
+
165
class MSFBlock(nn.Module):
    """Multi-scale fusion: project each input with a 3x3 conv, resize to a
    common spatial shape, and sum."""

    def __init__(self, in_planes, features, spec_norm=False):
        """
        :param in_planes: tuples of input planes
        """
        super().__init__()
        assert isinstance(in_planes, (list, tuple))
        self.features = features
        self.convs = nn.ModuleList(
            conv3x3(planes, features, stride=1, bias=True, spec_norm=spec_norm)
            for planes in in_planes
        )

    def forward(self, xs, shape):
        total = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
        for conv, x in zip(self.convs, xs):
            h = conv(x)
            h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
            total += h
        return total
185
+
186
+
187
class CondMSFBlock(nn.Module):
    """Conditional multi-scale fusion: normalize (class-conditionally),
    conv, resize to a common shape, and sum."""

    def __init__(self, in_planes, features, num_classes, normalizer, spec_norm=False):
        """
        :param in_planes: tuples of input planes
        """
        super().__init__()
        assert isinstance(in_planes, list) or isinstance(in_planes, tuple)

        self.convs = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.features = features
        self.normalizer = normalizer

        for i in range(len(in_planes)):
            self.convs.append(conv3x3(in_planes[i], features, stride=1, bias=True, spec_norm=spec_norm))
            self.norms.append(normalizer(in_planes[i], num_classes, bias=True))

    def forward(self, xs, y, shape):
        """Fuse feature maps xs (conditioned on class y) at spatial *shape*."""
        sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
        for i in range(len(self.convs)):
            h = self.norms[i](xs[i], y)
            h = self.convs[i](h)
            h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
            sums += h
        return sums
212
+
213
+
214
class RefineBlock(nn.Module):
    """RefineNet block: per-input RCU adaptation, multi-scale fusion (when
    more than one input), chained residual pooling, then an output RCU.

    start: this is the deepest block (no MSF needed).
    end: this is the final block (deeper, 3-block output RCU).
    """

    def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True, spec_norm=False):
        super().__init__()

        assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
        self.n_blocks = n_blocks = len(in_planes)

        self.adapt_convs = nn.ModuleList()
        for i in range(n_blocks):
            self.adapt_convs.append(
                RCUBlock(in_planes[i], 2, 2, act, spec_norm=spec_norm)
            )

        self.output_convs = RCUBlock(features, 3 if end else 1, 2, act, spec_norm=spec_norm)

        if not start:
            self.msf = MSFBlock(in_planes, features, spec_norm=spec_norm)

        self.crp = CRPBlock(features, 2, act, maxpool=maxpool, spec_norm=spec_norm)

    def forward(self, xs, output_shape):
        """xs: list/tuple of feature maps; output_shape: (H, W) after fusion."""
        assert isinstance(xs, tuple) or isinstance(xs, list)
        hs = []
        for i in range(len(xs)):
            h = self.adapt_convs[i](xs[i])
            hs.append(h)

        if self.n_blocks > 1:
            h = self.msf(hs, output_shape)
        else:
            h = hs[0]

        h = self.crp(h)
        h = self.output_convs(h)

        return h
250
+
251
+
252
+
253
class CondRefineBlock(nn.Module):
    """Conditional RefineNet block: conditional RCU adaptation, conditional
    multi-scale fusion, conditional CRP, then a conditional output RCU."""

    def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False, spec_norm=False):
        super().__init__()

        assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
        self.n_blocks = n_blocks = len(in_planes)

        self.adapt_convs = nn.ModuleList()
        for i in range(n_blocks):
            self.adapt_convs.append(
                CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act, spec_norm=spec_norm)
            )

        # the network's final block uses a deeper (3-block) output RCU
        self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act, spec_norm=spec_norm)

        if not start:
            self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer, spec_norm=spec_norm)

        self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act, spec_norm=spec_norm)

    def forward(self, xs, y, output_shape):
        """xs: feature maps; y: class index; output_shape: (H, W) for fusion."""
        assert isinstance(xs, tuple) or isinstance(xs, list)
        hs = []
        for i in range(len(xs)):
            h = self.adapt_convs[i](xs[i], y)
            hs.append(h)

        if self.n_blocks > 1:
            h = self.msf(hs, y, output_shape)
        else:
            h = hs[0]

        h = self.crp(h, y)
        h = self.output_convs(h, y)

        return h
289
+
290
+
291
class ConvMeanPool(nn.Module):
    """Convolution followed by 2x2 mean pooling (the 'conv-then-down' half of
    a resampling step).

    adjust_padding prepends a (1, 0, 1, 0) zero pad so the pooling phases
    line up on even-sized inputs.
    """

    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False, spec_norm=False):
        super().__init__()
        if not adjust_padding:
            conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
            if spec_norm:
                conv = spectral_norm(conv)
            self.conv = conv
        else:
            conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
            if spec_norm:
                conv = spectral_norm(conv)

            self.conv = nn.Sequential(
                nn.ZeroPad2d((1, 0, 1, 0)),
                conv
            )

    def forward(self, inputs):
        output = self.conv(inputs)
        # averaging the four stride-2 phases is equivalent to a 2x2 mean pool
        output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
                      output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
        return output
314
+
315
class MeanPoolConv(nn.Module):
    """2x2 mean pooling followed by a convolution (the 'down-then-conv'
    counterpart of ConvMeanPool)."""

    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, spec_norm=False):
        super().__init__()
        conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1,
                         padding=kernel_size // 2, bias=biases)
        self.conv = spectral_norm(conv) if spec_norm else conv

    def forward(self, inputs):
        # averaging the four stride-2 phases is equivalent to a 2x2 mean pool
        pooled = (inputs[:, :, ::2, ::2] + inputs[:, :, 1::2, ::2]
                  + inputs[:, :, ::2, 1::2] + inputs[:, :, 1::2, 1::2]) / 4.
        return self.conv(pooled)
327
+
328
+
329
class UpsampleConv(nn.Module):
    """2x spatial upsampling (channel replication + PixelShuffle) followed
    by a convolution."""

    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, spec_norm=False):
        super().__init__()
        conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1,
                         padding=kernel_size // 2, bias=biases)
        self.conv = spectral_norm(conv) if spec_norm else conv
        self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)

    def forward(self, inputs):
        # replicating the channels 4x then PixelShuffle(2) doubles H and W
        stacked = torch.cat([inputs, inputs, inputs, inputs], dim=1)
        return self.conv(self.pixelshuffle(stacked))
342
+
343
+
344
class ConditionalResidualBlock(nn.Module):
    """Residual block whose normalization layers are class-conditional.

    resample: 'down' halves the spatial size (via ConvMeanPool, or dilated
    convs when dilation is given); None keeps the size.  A learned shortcut
    is created whenever the output shape differs from the input.
    """

    def __init__(self, input_dim, output_dim, num_classes, resample=None, act=nn.ELU(),
                 normalization=ConditionalBatchNorm2d, adjust_padding=False, dilation=None, spec_norm=False):
        super().__init__()
        self.non_linearity = act
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.resample = resample
        self.normalization = normalization
        if resample == 'down':
            if dilation is not None:
                self.conv1 = dilated_conv3x3(input_dim, input_dim, dilation=dilation, spec_norm=spec_norm)
                self.normalize2 = normalization(input_dim, num_classes)
                self.conv2 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
            else:
                self.conv1 = conv3x3(input_dim, input_dim, spec_norm=spec_norm)
                self.normalize2 = normalization(input_dim, num_classes)
                self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding, spec_norm=spec_norm)
                conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding, spec_norm=spec_norm)

        elif resample is None:
            if dilation is not None:
                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
                self.conv1 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
                self.normalize2 = normalization(output_dim, num_classes)
                self.conv2 = dilated_conv3x3(output_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
            else:
                conv_shortcut = nn.Conv2d
                self.conv1 = conv3x3(input_dim, output_dim, spec_norm=spec_norm)
                self.normalize2 = normalization(output_dim, num_classes)
                self.conv2 = conv3x3(output_dim, output_dim, spec_norm=spec_norm)
        else:
            raise Exception('invalid resample value')

        # identity shortcut only works when the shape is unchanged
        if output_dim != input_dim or resample is not None:
            self.shortcut = conv_shortcut(input_dim, output_dim)

        self.normalize1 = normalization(input_dim, num_classes)


    def forward(self, x, y):
        """y: class index consumed by the conditional normalization layers."""
        output = self.normalize1(x, y)
        output = self.non_linearity(output)
        output = self.conv1(output)
        output = self.normalize2(output, y)
        output = self.non_linearity(output)
        output = self.conv2(output)

        if self.output_dim == self.input_dim and self.resample is None:
            shortcut = x
        else:
            shortcut = self.shortcut(x)

        return shortcut + output
399
+
400
+
401
class ResidualBlock(nn.Module):
    """Pre-activation residual block (unconditional NCSNv2 variant).

    resample: 'down' halves the spatial size (via ConvMeanPool, or dilated
    convs when dilation is given); None keeps the size.  A learned shortcut
    is created whenever the output shape differs from the input.
    """

    def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
                 normalization=nn.BatchNorm2d, adjust_padding=False, dilation=None, spec_norm=False):
        super().__init__()
        self.non_linearity = act
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.resample = resample
        self.normalization = normalization
        if resample == 'down':
            if dilation is not None:
                self.conv1 = dilated_conv3x3(input_dim, input_dim, dilation=dilation, spec_norm=spec_norm)
                self.normalize2 = normalization(input_dim)
                self.conv2 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
            else:
                self.conv1 = conv3x3(input_dim, input_dim, spec_norm=spec_norm)
                self.normalize2 = normalization(input_dim)
                self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding, spec_norm=spec_norm)
                conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding, spec_norm=spec_norm)

        elif resample is None:
            if dilation is not None:
                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
                self.conv1 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
                self.normalize2 = normalization(output_dim)
                self.conv2 = dilated_conv3x3(output_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
            else:
                # conv_shortcut = nn.Conv2d ### Something weird here.
                conv_shortcut = partial(conv1x1, spec_norm=spec_norm)
                self.conv1 = conv3x3(input_dim, output_dim, spec_norm=spec_norm)
                self.normalize2 = normalization(output_dim)
                self.conv2 = conv3x3(output_dim, output_dim, spec_norm=spec_norm)
        else:
            raise Exception('invalid resample value')

        # identity shortcut only works when the shape is unchanged
        if output_dim != input_dim or resample is not None:
            self.shortcut = conv_shortcut(input_dim, output_dim)

        self.normalize1 = normalization(input_dim)


    def forward(self, x):
        output = self.normalize1(x)
        output = self.non_linearity(output)
        output = self.conv1(output)
        output = self.normalize2(output)
        output = self.non_linearity(output)
        output = self.conv2(output)

        if self.output_dim == self.input_dim and self.resample is None:
            shortcut = x
        else:
            shortcut = self.shortcut(x)

        return shortcut + output
ncsn/ncsnv2.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+ import torch
5
+ from functools import partial
6
+ from .layers import *
7
+ from .normalization import get_normalization
8
+
9
+
10
def get_sigmas(config):
    """Build the noise-level schedule as a 1-D float tensor on config.device.

    'geometric' interpolates log-linearly between sigma_begin and sigma_end;
    'uniform' interpolates linearly.  Any other value raises
    NotImplementedError.
    """
    dist = config.model.sigma_dist
    begin = config.model.sigma_begin
    end = config.model.sigma_end
    n = config.model.num_classes

    if dist == 'geometric':
        levels = np.exp(np.linspace(np.log(begin), np.log(end), n))
    elif dist == 'uniform':
        levels = np.linspace(begin, end, n)
    else:
        raise NotImplementedError('sigma distribution not supported')

    return torch.tensor(levels).float().to(config.device)
24
+
25
+
26
class NCSNv2(nn.Module):
    """Noise-conditional score network v2 (RefineNet-style encoder/decoder).

    forward(x, y) estimates the score of x at noise level ``sigmas[y]``;
    the raw network output is divided by the per-sample sigma.
    """

    def __init__(self, config):
        super().__init__()
        self.logit_transform = config.data.logit_transform
        self.rescaled = config.data.rescaled
        # unconditional normalization *class* (instantiated per layer below)
        self.norm = get_normalization(config, conditional=False)
        self.ngf = ngf = config.model.ngf
        self.num_classes = num_classes = config.model.num_classes

        self.act = act = get_act(config)
        # buffer so the schedule moves with .to()/state_dict but is not trained
        self.register_buffer('sigmas', get_sigmas(config))
        self.config = config

        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)

        self.normalizer = self.norm(ngf, self.num_classes)
        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)

        # encoder: res1 keeps resolution; res2-res4 each downsample once,
        # with dilation 2 / 4 in the deeper stages
        self.res1 = nn.ModuleList([
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm),
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res2 = nn.ModuleList([
            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res3 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm, dilation=2),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm, dilation=2)]
        )

        # 28x28 inputs (e.g. MNIST-sized) need padding adjustment after the
        # odd-sized downsamples
        if config.data.image_size == 28:
            self.res4 = nn.ModuleList([
                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                              normalization=self.norm, adjust_padding=True, dilation=4),
                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                              normalization=self.norm, dilation=4)]
            )
        else:
            self.res4 = nn.ModuleList([
                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                              normalization=self.norm, adjust_padding=False, dilation=4),
                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                              normalization=self.norm, dilation=4)]
            )

        # decoder: RefineBlocks fuse each encoder stage with the upsampled path
        self.refine1 = RefineBlock([2 * self.ngf], 2 * self.ngf, act=act, start=True)
        self.refine2 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
        self.refine4 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)

    def _compute_cond_module(self, module, x):
        """Apply the blocks of a ModuleList sequentially."""
        for m in module:
            x = m(x)
        return x

    def forward(self, x, y):
        """Score estimate for batch ``x`` at noise-level indices ``y``."""
        # recenter [0, 1] data to [-1, 1] unless preprocessing already did
        if not self.logit_transform and not self.rescaled:
            h = 2 * x - 1.
        else:
            h = x

        output = self.begin_conv(h)

        layer1 = self._compute_cond_module(self.res1, output)
        layer2 = self._compute_cond_module(self.res2, layer1)
        layer3 = self._compute_cond_module(self.res3, layer2)
        layer4 = self._compute_cond_module(self.res4, layer3)

        # decode coarsest-to-finest, fusing skip connections at each scale
        ref1 = self.refine1([layer4], layer4.shape[2:])
        ref2 = self.refine2([layer3, ref1], layer3.shape[2:])
        ref3 = self.refine3([layer2, ref2], layer2.shape[2:])
        output = self.refine4([layer1, ref3], layer1.shape[2:])

        output = self.normalizer(output)
        output = self.act(output)
        output = self.end_conv(output)

        # broadcast each sample's sigma over its C/H/W dims
        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))

        output = output / used_sigmas

        return output
117
+
118
+
119
class NCSNv2Deeper(nn.Module):
    """Deeper NCSNv2 variant: five encoder stages (up to 4*ngf channels)
    and five RefineBlocks.  Same forward contract as NCSNv2."""

    def __init__(self, config):
        super().__init__()
        self.logit_transform = config.data.logit_transform
        self.rescaled = config.data.rescaled
        # unconditional normalization *class* (instantiated per layer below)
        self.norm = get_normalization(config, conditional=False)
        self.ngf = ngf = config.model.ngf
        self.num_classes = config.model.num_classes
        self.act = act = get_act(config)
        # buffer: moves with .to()/state_dict, not trained
        self.register_buffer('sigmas', get_sigmas(config))
        self.config = config

        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)
        self.normalizer = self.norm(ngf, self.num_classes)

        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)

        # encoder: res1 keeps resolution; res2-res5 each downsample once,
        # dilations 2 and 4 in the deepest two stages
        self.res1 = nn.ModuleList([
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm),
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res2 = nn.ModuleList([
            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res3 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res4 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 4 * self.ngf, resample='down', act=act,
                          normalization=self.norm, dilation=2),
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
                          normalization=self.norm, dilation=2)]
        )

        self.res5 = nn.ModuleList([
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample='down', act=act,
                          normalization=self.norm, dilation=4),
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
                          normalization=self.norm, dilation=4)]
        )

        # decoder mirrors the encoder, fusing one skip connection per scale
        self.refine1 = RefineBlock([4 * self.ngf], 4 * self.ngf, act=act, start=True)
        self.refine2 = RefineBlock([4 * self.ngf, 4 * self.ngf], 2 * self.ngf, act=act)
        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
        self.refine4 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
        self.refine5 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)

    def _compute_cond_module(self, module, x):
        """Apply the blocks of a ModuleList sequentially."""
        for m in module:
            x = m(x)
        return x

    def forward(self, x, y):
        """Score estimate for batch ``x`` at noise-level indices ``y``."""
        # recenter [0, 1] data to [-1, 1] unless preprocessing already did
        if not self.logit_transform and not self.rescaled:
            h = 2 * x - 1.
        else:
            h = x

        output = self.begin_conv(h)

        layer1 = self._compute_cond_module(self.res1, output)
        layer2 = self._compute_cond_module(self.res2, layer1)
        layer3 = self._compute_cond_module(self.res3, layer2)
        layer4 = self._compute_cond_module(self.res4, layer3)
        layer5 = self._compute_cond_module(self.res5, layer4)

        ref1 = self.refine1([layer5], layer5.shape[2:])
        ref2 = self.refine2([layer4, ref1], layer4.shape[2:])
        ref3 = self.refine3([layer3, ref2], layer3.shape[2:])
        ref4 = self.refine4([layer2, ref3], layer2.shape[2:])
        output = self.refine5([layer1, ref4], layer1.shape[2:])

        output = self.normalizer(output)
        output = self.act(output)
        output = self.end_conv(output)

        # broadcast each sample's sigma over its C/H/W dims
        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))

        output = output / used_sigmas

        return output
211
+
212
+
213
class NCSNv2Deepest(nn.Module):
    """Deepest NCSNv2 variant: adds an extra stage (res31/refine31) between
    stages 3 and 4 of NCSNv2Deeper.  Same forward contract as NCSNv2."""

    def __init__(self, config):
        super().__init__()
        self.logit_transform = config.data.logit_transform
        self.rescaled = config.data.rescaled
        # unconditional normalization *class* (instantiated per layer below)
        self.norm = get_normalization(config, conditional=False)
        self.ngf = ngf = config.model.ngf
        self.num_classes = config.model.num_classes
        self.act = act = get_act(config)
        # buffer: moves with .to()/state_dict, not trained
        self.register_buffer('sigmas', get_sigmas(config))
        self.config = config

        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)
        self.normalizer = self.norm(ngf, self.num_classes)

        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)

        self.res1 = nn.ModuleList([
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm),
            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res2 = nn.ModuleList([
            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res3 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        # extra downsampling stage unique to the "deepest" variant
        self.res31 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
                          normalization=self.norm),
            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
                          normalization=self.norm)]
        )

        self.res4 = nn.ModuleList([
            ResidualBlock(2 * self.ngf, 4 * self.ngf, resample='down', act=act,
                          normalization=self.norm, dilation=2),
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
                          normalization=self.norm, dilation=2)]
        )

        self.res5 = nn.ModuleList([
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample='down', act=act,
                          normalization=self.norm, dilation=4),
            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
                          normalization=self.norm, dilation=4)]
        )

        self.refine1 = RefineBlock([4 * self.ngf], 4 * self.ngf, act=act, start=True)
        self.refine2 = RefineBlock([4 * self.ngf, 4 * self.ngf], 2 * self.ngf, act=act)
        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
        self.refine31 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
        self.refine4 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
        self.refine5 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)

    def _compute_cond_module(self, module, x):
        """Apply the blocks of a ModuleList sequentially."""
        for m in module:
            x = m(x)
        return x

    def forward(self, x, y):
        """Score estimate for batch ``x`` at noise-level indices ``y``."""
        # recenter [0, 1] data to [-1, 1] unless preprocessing already did
        if not self.logit_transform and not self.rescaled:
            h = 2 * x - 1.
        else:
            h = x

        output = self.begin_conv(h)

        layer1 = self._compute_cond_module(self.res1, output)
        layer2 = self._compute_cond_module(self.res2, layer1)
        layer3 = self._compute_cond_module(self.res3, layer2)
        layer31 = self._compute_cond_module(self.res31, layer3)
        layer4 = self._compute_cond_module(self.res4, layer31)
        layer5 = self._compute_cond_module(self.res5, layer4)

        # decode coarse-to-fine; note refine31 sits between refine2 and
        # refine3, mirroring the extra res31 encoder stage
        ref1 = self.refine1([layer5], layer5.shape[2:])
        ref2 = self.refine2([layer4, ref1], layer4.shape[2:])
        ref31 = self.refine31([layer31, ref2], layer31.shape[2:])
        ref3 = self.refine3([layer3, ref31], layer3.shape[2:])
        ref4 = self.refine4([layer2, ref3], layer2.shape[2:])
        output = self.refine5([layer1, ref4], layer1.shape[2:])

        output = self.normalizer(output)
        output = self.act(output)
        output = self.end_conv(output)

        # broadcast each sample's sigma over its C/H/W dims
        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))

        output = output / used_sigmas

        return output
ncsn/normalization.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
def get_normalization(config, conditional=True):
    """Resolve config.model.normalization to a normalization layer *class*.

    Conditional variants are constructed as cls(num_features, num_classes);
    unconditional ones as cls(num_features).  For the unconditional path a
    ``None`` setting returns None.  Unknown names raise NotImplementedError.
    """
    norm = config.model.normalization

    if conditional:
        table = {
            'NoneNorm': ConditionalNoneNorm2d,
            'InstanceNorm++': ConditionalInstanceNorm2dPlus,
            'InstanceNorm': ConditionalInstanceNorm2d,
            'BatchNorm': ConditionalBatchNorm2d,
            'VarianceNorm': ConditionalVarianceNorm2d,
        }
    else:
        table = {
            'BatchNorm': nn.BatchNorm2d,
            'InstanceNorm': nn.InstanceNorm2d,
            'InstanceNorm++': InstanceNorm2dPlus,
            'VarianceNorm': VarianceNorm2d,
            'NoneNorm': NoneNorm2d,
            None: None,
        }

    if norm not in table:
        raise NotImplementedError("{} does not exist!".format(norm))
    return table[norm]
35
+
36
class ConditionalBatchNorm2d(nn.Module):
    """Class-conditional BatchNorm: affine-free BN followed by a per-class
    scale (and, with ``bias``, shift) looked up from an Embedding."""

    def __init__(self, num_features, num_classes, bias=True):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        # affine=False: the conditional embedding supplies gamma/beta instead
        self.bn = nn.BatchNorm2d(num_features, affine=False)
        if self.bias:
            # one row per class: first half is gamma, second half beta
            self.embed = nn.Embedding(num_classes, num_features * 2)
            self.embed.weight.data[:, :num_features].uniform_()  # NOTE(review): comment says N(1, 0.02) but this is uniform [0,1) — confirm intended init
            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
        else:
            self.embed = nn.Embedding(num_classes, num_features)
            self.embed.weight.data.uniform_()

    def forward(self, x, y):
        """Normalize ``x`` and modulate with the embedding row for class ``y``."""
        out = self.bn(x)
        if self.bias:
            gamma, beta = self.embed(y).chunk(2, dim=1)
            out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
        else:
            gamma = self.embed(y)
            out = gamma.view(-1, self.num_features, 1, 1) * out
        return out
59
+
60
+
61
class ConditionalInstanceNorm2d(nn.Module):
    """Class-conditional InstanceNorm: affine-free IN followed by a per-class
    scale (and, with ``bias``, shift) looked up from an Embedding."""

    def __init__(self, num_features, num_classes, bias=True):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
        if bias:
            # one row per class: first half is gamma, second half beta
            self.embed = nn.Embedding(num_classes, num_features * 2)
            self.embed.weight.data[:, :num_features].uniform_()  # NOTE(review): comment says N(1, 0.02) but this is uniform [0,1) — confirm intended init
            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
        else:
            self.embed = nn.Embedding(num_classes, num_features)
            self.embed.weight.data.uniform_()

    def forward(self, x, y):
        """Normalize ``x`` per-instance and modulate with class ``y``'s row."""
        h = self.instance_norm(x)
        if self.bias:
            # dim=-1 (vs dim=1 in the BatchNorm variant) — equivalent for a
            # 2-D (batch, 2*features) embedding output
            gamma, beta = self.embed(y).chunk(2, dim=-1)
            out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
        else:
            gamma = self.embed(y)
            out = gamma.view(-1, self.num_features, 1, 1) * h
        return out
84
+
85
+
86
class ConditionalVarianceNorm2d(nn.Module):
    """Class-conditional variance normalization: divide each channel by its
    spatial std (no mean subtraction), then scale by a per-class gamma."""

    def __init__(self, num_features, num_classes, bias=False):
        super().__init__()
        self.num_features = num_features
        # ``bias`` is stored but no beta is ever applied in forward
        self.bias = bias
        self.embed = nn.Embedding(num_classes, num_features)
        self.embed.weight.data.normal_(1, 0.02)

    def forward(self, x, y):
        # per-channel spatial variance; epsilon guards flat channels
        vars = torch.var(x, dim=(2, 3), keepdim=True)
        h = x / torch.sqrt(vars + 1e-5)

        gamma = self.embed(y)
        out = gamma.view(-1, self.num_features, 1, 1) * h
        return out
101
+
102
+
103
class VarianceNorm2d(nn.Module):
    """Variance normalization: divide each channel by its spatial std (no
    mean subtraction), then rescale by a learned per-channel gain."""

    def __init__(self, num_features, bias=False):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        self.alpha = nn.Parameter(torch.zeros(num_features))
        self.alpha.data.normal_(1, 0.02)

    def forward(self, x):
        variance = torch.var(x, dim=(2, 3), keepdim=True)
        normalized = x / torch.sqrt(variance + 1e-5)
        return self.alpha.view(-1, self.num_features, 1, 1) * normalized
117
+
118
+
119
class ConditionalNoneNorm2d(nn.Module):
    """No normalization at all — just a per-class affine modulation
    (gamma, and beta when ``bias``) applied directly to the input."""

    def __init__(self, num_features, num_classes, bias=True):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        if bias:
            # one row per class: first half is gamma, second half beta
            self.embed = nn.Embedding(num_classes, num_features * 2)
            self.embed.weight.data[:, :num_features].uniform_()  # NOTE(review): comment says N(1, 0.02) but this is uniform [0,1) — confirm intended init
            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
        else:
            self.embed = nn.Embedding(num_classes, num_features)
            self.embed.weight.data.uniform_()

    def forward(self, x, y):
        """Modulate ``x`` with class ``y``'s affine parameters (no norm)."""
        if self.bias:
            gamma, beta = self.embed(y).chunk(2, dim=-1)
            out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
        else:
            gamma = self.embed(y)
            out = gamma.view(-1, self.num_features, 1, 1) * x
        return out
140
+
141
+
142
class NoneNorm2d(nn.Module):
    """Identity layer used when config selects 'NoneNorm' (unconditional).

    ``num_features`` and ``bias`` are accepted only for interface parity
    with the other normalization classes and are ignored.
    """

    def __init__(self, num_features, bias=True):
        super().__init__()

    def forward(self, x):
        return x
148
+
149
+
150
class InstanceNorm2dPlus(nn.Module):
    """InstanceNorm++ (Song & Ermon): standard instance norm plus a term that
    reinjects the per-channel means (standardized across channels), so the
    layer does not discard relative channel brightness."""

    def __init__(self, num_features, bias=True):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
        # alpha scales the reinjected means; gamma/beta are the usual affine
        self.alpha = nn.Parameter(torch.zeros(num_features))
        self.gamma = nn.Parameter(torch.zeros(num_features))
        self.alpha.data.normal_(1, 0.02)
        self.gamma.data.normal_(1, 0.02)
        if bias:
            self.beta = nn.Parameter(torch.zeros(num_features))

    def forward(self, x):
        # per-channel spatial means, then standardize them across channels
        means = torch.mean(x, dim=(2, 3))
        m = torch.mean(means, dim=-1, keepdim=True)
        v = torch.var(means, dim=-1, keepdim=True)
        means = (means - m) / (torch.sqrt(v + 1e-5))
        h = self.instance_norm(x)

        if self.bias:
            h = h + means[..., None, None] * self.alpha[..., None, None]
            out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
        else:
            h = h + means[..., None, None] * self.alpha[..., None, None]
            out = self.gamma.view(-1, self.num_features, 1, 1) * h
        return out
177
+
178
+
179
class ConditionalInstanceNorm2dPlus(nn.Module):
    """Class-conditional InstanceNorm++: like InstanceNorm2dPlus, but gamma,
    alpha (and beta when ``bias``) are looked up per class from an
    Embedding."""

    def __init__(self, num_features, num_classes, bias=True):
        super().__init__()
        self.num_features = num_features
        self.bias = bias
        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
        if bias:
            # row layout per class: [gamma | alpha | beta]
            self.embed = nn.Embedding(num_classes, num_features * 3)
            self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
            self.embed.weight.data[:, 2 * num_features:].zero_()  # Initialise bias at 0
        else:
            # row layout per class: [gamma | alpha]
            self.embed = nn.Embedding(num_classes, 2 * num_features)
            self.embed.weight.data.normal_(1, 0.02)

    def forward(self, x, y):
        # per-channel spatial means, standardized across channels
        means = torch.mean(x, dim=(2, 3))
        m = torch.mean(means, dim=-1, keepdim=True)
        v = torch.var(means, dim=-1, keepdim=True)
        means = (means - m) / (torch.sqrt(v + 1e-5))
        h = self.instance_norm(x)

        if self.bias:
            gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
            # reinject standardized channel means scaled by per-class alpha
            h = h + means[..., None, None] * alpha[..., None, None]
            out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
        else:
            gamma, alpha = self.embed(y).chunk(2, dim=-1)
            h = h + means[..., None, None] * alpha[..., None, None]
            out = gamma.view(-1, self.num_features, 1, 1) * h
        return out
pose.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from numpy import sin, cos
3
+ from math import pi as π
4
+ from my3d import camera_pose
5
+ from my.config import BaseConf
6
+ import random
7
+
8
+
9
def get_K(H, W, FoV_x):
    """Pinhole intrinsics for a camera looking down -z.

    ``FoV_x`` is the horizontal field of view in degrees.  The focal
    length in pixels is f = (W/2) / tan(FoV_x/2); negative y/z entries
    encode the flipped axes, and the principal point sits at the pixel
    center ((W-1)/2, (H-1)/2).
    """
    fov_rad = FoV_x / 180 * np.pi  # degrees -> radians
    f = (W / 2) / np.tan(fov_rad / 2)

    cx = W / 2 - 0.5
    cy = H / 2 - 0.5
    return np.array([
        [f, 0, -cx],
        [0, -f, -cy],
        [0, 0, -1],
    ])
19
+
20
+
21
# Prompt prefixes indexed by azimuth quadrant; the quadrant is computed as
# int(((azimuth_deg + 45) % 360) // 90), so index 0 is front, 1 and 3 are
# the two sides, 2 is the back.
SIDEVIEW_PROMPTS = [
    "front view of", "side view of", "backside view of", "side view of"
]

# Used instead of a side-view prefix when the camera elevation is steep.
TOPVIEW_PROMPT = "overhead view of"
26
+
27
+
28
def train_eye_with_prompts(r, n):
    """Sample ``n`` random camera eye positions on a sphere of radius ``r``,
    plus a view-dependent text-prompt prefix for each.

    Returns (eyes, prompts): ``eyes`` is an (n, 3) array of xyz positions
    with y up; ``prompts`` is a list of n prefix strings (overhead prompt
    for steep elevations, otherwise picked by azimuth quadrant).
    """
    hs = np.random.rand(n) * 360                # azimuth, degrees
    vs = np.random.rand(n) * np.deg2rad(100)    # angle from the top, radians
    vs = np.clip(vs, 1e-2, π-1e-2)              # keep away from the poles

    prompts = []
    v_thresh = np.deg2rad(30)
    for i in range(n):
        _p = ""
        if vs[i] < v_thresh:
            # near-vertical view
            _p = TOPVIEW_PROMPT
        else:
            # shift by 45° so each 90° quadrant is centred on an axis
            _a = hs[i]
            _a = (_a + 45) % 360
            _quad = int(_a // 90)
            _p = SIDEVIEW_PROMPTS[_quad]
        prompts.append(_p)

    θ = np.deg2rad(hs)
    # φ = v
    # remap the polar angle so samples are uniform over the sphere surface
    φ = np.arccos(1 - 2 * (vs / π))

    eyes = np.zeros((n, 3))

    # spherical -> Cartesian with y up; π-θ flips the azimuth direction
    eyes[:, 0] = r * sin(φ) * cos(π-θ)  # x
    eyes[:, 2] = r * sin(φ) * sin(π-θ)  # z
    eyes[:, 1] = r * cos(φ)  # y

    return eyes, prompts
57
+
58
+
59
def spiral_poses(
    radius, height,
    num_steps=20, num_rounds=1,
    center=np.array([0, 0, 0]), up=np.array([0, 1, 0]),
):
    """Camera poses along a descending spiral around ``center``.

    The eye starts ``height`` above the center and winds down over
    ``num_rounds`` turns; the horizontal radius ramps in with a sine
    schedule, clamped to at least 0.5.  Every pose looks at ``center``.
    """
    eyes = []
    for step in range(num_steps):
        t = (step + 1) / num_steps
        dy = height * (1 - t)

        angle = t * (360 * num_rounds)
        angle = angle / 180 * π
        # sine ramp eases the radius in; floor of 0.5 avoids degenerate poses
        rr = max(radius * sin(t * π / 2), 0.5)
        dx = rr * np.cos(angle)
        dz = rr * np.sin(angle)
        eyes.append(center + [dx, dy, dz])

    return [camera_pose(eye, center - eye, up) for eye in eyes]
80
+
81
+
82
class PoseConfig(BaseConf):
    # square render resolution (used for both H and W)
    rend_hw: int = 64
    # horizontal field of view in degrees
    FoV: float = 60.0
    # camera orbit radius
    R: float = 1.5

    def make(self):
        """Instantiate a Poser, expanding rend_hw into H and W."""
        cfgs = self.dict()
        hw = cfgs.pop("rend_hw")
        cfgs["H"] = hw
        cfgs["W"] = hw
        return Poser(**cfgs)
93
+
94
+
95
class Poser():
    """Samples camera intrinsics and poses for training and test rendering."""

    def __init__(self, H, W, FoV, R):
        self.H, self.W = H, W
        self.R = R
        # default intrinsics, used as-is by sample_test
        self.K = get_K(H, W, FoV)

    def sample_train(self, n):
        """Return (Ks, poses, prompts) for ``n`` random training views.

        Poses look at the origin from random points on the radius-R sphere;
        each view gets its own K with FoV drawn uniformly from [40, 70).
        """
        eyes, prompts = train_eye_with_prompts(r=self.R, n=n)
        up = np.array([0, 1, 0])
        poses = [
            camera_pose(e, -e, up) for e in eyes
        ]
        poses = np.stack(poses, 0)
        # FoV during training: [40,70]
        random_Ks = [
            get_K(self.H, self.W, random.random() * 30 + 40)
            for i in range(len(poses))
            # self.K for i in range(len(poses))
        ]
        # return self.K, poses, prompts
        return random_Ks, poses, prompts

    def sample_test(self, n):
        """Return (K, poses) for an ``n``-step, 3-round spiral fly-around."""
        poses = spiral_poses(self.R, self.R, n, num_rounds=3)
        poses = np.stack(poses, axis=0)
        return self.K, poses
release/diffusion_ckpts/guided_ddpm/models/lsun_bedroom.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9faf136dc2375dcdb392b35cee9ca9dca1fd5257b2f3358613136395ec39231
3
+ size 2211383297
release/diffusion_ckpts/guided_ddpm/models/lsun_ffhq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e409993ae12fc4cb8cd61aba7352c1bc0af0735e2debdd4b3c609280c8dc448b
3
+ size 2211370791
release/diffusion_ckpts/stable_diffusion/sd-v1-5.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1441589a6f3c5a53f5f54d0975a18a7feb7cdf0b0dee276dfc3331ae376a053
3
+ size 7703807346
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pydantic
2
+ tqdm
3
+ click
4
+ easydict
5
+ tabulate
6
+ imageio
7
+ einops
8
+ matplotlib
9
+ omegaconf==2.1.1
10
+ torchmetrics==0.6.0
11
+ pytorch-lightning==1.4.2
12
+ transformers
13
+ kornia==0.6.0
14
+ git+https://github.com/openai/CLIP.git#egg=clip
15
+ imageio[ffmpeg]
16
+ imageio[pyav]
17
+ --extra-index-url https://download.pytorch.org/whl/cu116
18
+ torch
19
+ torchvision
20
+ torchaudio
21
+ -e git+https://github.com/CompVis/taming-transformers.git#egg=taming-transformers
run_img_sampling.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import numpy as np
3
+ import torch
4
+
5
+ from misc import torch_samps_to_imgs
6
+ from adapt import Karras, ScoreAdapter, power_schedule
7
+ from adapt_gddpm import GuidedDDPM
8
+ from adapt_ncsn import NCSN as _NCSN
9
+ # from adapt_vesde import VESDE # not included to prevent import conflicts
10
+ from adapt_sd import StableDiffusion
11
+
12
+ from my.utils import tqdm, EventStorage, HeartBeat, EarlyLoopBreak
13
+ from my.config import BaseConf, dispatch
14
+ from my.utils.seed import seed_everything
15
+
16
+
17
class GDDPM(BaseConf):
    """Guided DDPM from OpenAI"""
    # checkpoint/variant identifier passed to GuidedDDPM
    model: str = "m_lsun_256"
    # LSUN category when using an LSUN checkpoint
    lsun_cat: str = "bedroom"
    # ImageNet class id (-1 presumably means unconditional — confirm in GuidedDDPM)
    imgnet_cat: int = -1

    def make(self):
        """Instantiate the GuidedDDPM adapter from this config."""
        args = self.dict()
        model = GuidedDDPM(**args)
        return model
27
+
28
+
29
class SD(BaseConf):
    """Stable Diffusion"""
    # model version tag passed through to StableDiffusion
    variant: str = "v1"
    # whether to use the high-resolution v2 checkpoint
    v2_highres: bool = False
    prompt: str = "a photograph of an astronaut riding a horse"
    scale: float = 3.0  # classifier free guidance scale
    # torch autocast / full precision selector
    precision: str = 'autocast'

    def make(self):
        """Instantiate the StableDiffusion adapter from this config."""
        args = self.dict()
        model = StableDiffusion(**args)
        return model
41
+
42
+
43
class SDE(BaseConf):
    """Config stub for the VE-SDE score model.

    NOTE(review): the ``from adapt_vesde import VESDE`` import at the top of
    this file is commented out (to prevent import conflicts), so calling
    make() as-is raises NameError — re-enable that import before use.
    """
    def make(self):
        args = self.dict()
        model = VESDE(**args)
        return model
48
+
49
+
50
class NCSN(BaseConf):
    """Config stub for the NCSN score model (no options; passthrough)."""
    def make(self):
        args = self.dict()
        model = _NCSN(**args)
        return model
55
+
56
+
57
class KarrasGen(BaseConf):
    """Image generation with the Karras et al. sampler over a chosen
    score-model family; saves all samples as one .npy artifact."""

    # which sub-config below to instantiate: "gddpm", "sd", or "ncsn"
    family: str = "gddpm"
    gddpm: GDDPM = GDDPM()
    sd: SD = SD()
    # sde: SDE = SDE()
    ncsn: NCSN = NCSN()

    batch_size: int = 10
    num_images: int = 1250
    # number of sampler time steps
    num_t: int = 40
    σ_max: float = 80.0
    # use Heun's 2nd-order correction step
    heun: bool = True
    # add Langevin churn between steps
    langevin: bool = False
    cls_scaling: float = 1.0  # classifier guidance scaling

    def run(self):
        """Build the selected model and run batched generation."""
        args = self.dict()
        family = args.pop("family")
        model = getattr(self, family).make()
        self.karras_generate(model, **args)

    @staticmethod
    def karras_generate(
        model: ScoreAdapter,
        batch_size, num_images, σ_max, num_t, langevin, heun, cls_scaling,
        **kwargs
    ):
        """Generate ``num_images`` samples in batches and save them.

        Note: num_images // batch_size truncates, so a non-divisible
        num_images produces fewer images than requested.
        """
        del kwargs  # removed extra args
        num_batches = num_images // batch_size

        fuse = EarlyLoopBreak(5)
        with tqdm(total=num_batches) as pbar, \
            HeartBeat(pbar) as hbeat, \
                EventStorage() as metric:

            all_imgs = []

            for _ in range(num_batches):
                if fuse.on_break():
                    break

                pipeline = Karras.inference(
                    model, batch_size, num_t,
                    init_xs=None, heun=heun, σ_max=σ_max,
                    langevin=langevin, cls_scaling=cls_scaling
                )

                # drain the sampler; `imgs` holds the final batch afterwards
                for imgs in tqdm(pipeline, total=num_t+1, disable=False):
                    # _std = imgs.std().item()
                    # print(_std)
                    hbeat.beat()
                    pass

                # SD samples live in latent space; decode to pixels first
                if isinstance(model, StableDiffusion):
                    imgs = model.decode(imgs)

                imgs = torch_samps_to_imgs(imgs, uncenter=model.samps_centered())
                all_imgs.append(imgs)

                pbar.update()

            all_imgs = np.concatenate(all_imgs, axis=0)
            metric.put_artifact("imgs", ".npy", lambda fn: np.save(fn, all_imgs))
            metric.step()
            hbeat.done()
+
123
+
124
class SMLDGen(BaseConf):
    """Image generation via annealed Langevin dynamics (SMLD) over a chosen
    score-model family; saves all samples as one .npy artifact."""

    # which sub-config below to instantiate: "gddpm" or "ncsn"
    family: str = "ncsn"
    gddpm: GDDPM = GDDPM()
    # sde: SDE = SDE()
    ncsn: NCSN = NCSN()

    batch_size: int = 16
    num_images: int = 16
    # number of noise levels in the annealing schedule
    num_stages: int = 80
    # Langevin steps per noise level
    num_steps: int = 15
    σ_max: float = 80.0
    # base step size ε for the Langevin updates
    ε: float = 1e-5

    def run(self):
        """Build the selected model and run batched SMLD sampling."""
        args = self.dict()
        family = args.pop("family")
        model = getattr(self, family).make()
        self.smld_generate(model, **args)

    @staticmethod
    def smld_generate(
        model: ScoreAdapter,
        batch_size, num_images, num_stages, num_steps, σ_max, ε,
        **kwargs
    ):
        """Sample ``num_images`` images (num_images // batch_size batches).

        Builds a power schedule of sigmas from σ_max down to the model's
        σ_min (snapped to the model's discrete ticks), runs Langevin
        dynamics from uniform noise, and saves results.
        """
        num_batches = num_images // batch_size
        σs = power_schedule(σ_max, model.σ_min, num_stages)
        σs = [model.snap_t_to_nearest_tick(σ)[0] for σ in σs]

        fuse = EarlyLoopBreak(5)
        with tqdm(total=num_batches) as pbar, \
            HeartBeat(pbar) as hbeat, \
                EventStorage() as metric:

            all_imgs = []

            for _ in range(num_batches):
                if fuse.on_break():
                    break

                # start from uniform noise in the model's expected range
                init_xs = torch.rand(batch_size, *model.data_shape(), device=model.device)
                if model.samps_centered():
                    init_xs = init_xs * 2 - 1  # [0, 1] -> [-1, 1]

                pipeline = smld_inference(
                    model, σs, num_steps, ε, init_xs
                )

                # drain the sampler, logging batch stats at every step;
                # `imgs` holds the final batch afterwards
                for imgs in tqdm(pipeline, total=(num_stages * num_steps)+1, disable=False):
                    pbar.set_description(f"{imgs.max().item():.3f}")
                    metric.put_scalars(
                        max=imgs.max().item(), min=imgs.min().item(), std=imgs.std().item()
                    )
                    metric.step()
                    hbeat.beat()

                pbar.update()
                imgs = torch_samps_to_imgs(imgs, uncenter=model.samps_centered())
                all_imgs.append(imgs)

            all_imgs = np.concatenate(all_imgs, axis=0)
            metric.put_artifact("imgs", ".npy", lambda fn: np.save(fn, all_imgs))
            metric.step()
            hbeat.done()
+
189
+
190
def smld_inference(model, σs, num_steps, ε, init_xs):
    """Annealed Langevin dynamics sampler (SMLD/NCSN style), as a generator.

    For each noise level σ_i in ``σs`` (high to low) it runs ``num_steps``
    Langevin updates with step size α_i = ε·(σ_i/σ_last)², yielding the
    batch after the initial state and after every single update.
    """
    from math import sqrt
    # not doing conditioning or cls guidance; for gddpm only lsun works; fine.

    xs = init_xs
    yield xs

    σ_last = σs[-1]
    for σ in σs:
        α = ε * ((σ / σ_last) ** 2)
        noise_scale = sqrt(2 * α)
        for _ in range(num_steps):
            score = model.score(xs, σ)
            z = torch.randn_like(xs)
            xs = xs + α * score + noise_scale * z
            yield xs
+
205
+
206
def load_np_imgs(fname):
    """Load an image array saved as .npy, or as .npz under key 'arr_0'."""
    fname = Path(fname)
    payload = np.load(fname)
    return payload['arr_0'] if fname.suffix == ".npz" else payload
+
215
+
216
def visualize(max_n_imgs=16):
    """Tile the first ``max_n_imgs`` samples from imgs/step_0.npy into a
    grid and write it to preview.jpg.

    NOTE(review): assumes the saved array layout is (N, H, W, C) with C=3 —
    matches what karras_generate saves via torch_samps_to_imgs; confirm if
    reused elsewhere.
    """
    import torchvision.utils as vutils
    from imageio import imwrite
    from einops import rearrange

    all_imgs = load_np_imgs("imgs/step_0.npy")

    imgs = all_imgs[:max_n_imgs]
    imgs = rearrange(imgs, "N H W C -> N C H W", C=3)
    imgs = torch.from_numpy(imgs)
    pane = vutils.make_grid(imgs, padding=2, nrow=4)
    pane = rearrange(pane, "C H W -> H W C", C=3)
    pane = pane.numpy()
    imwrite("preview.jpg", pane)
230
+
231
+
232
if __name__ == "__main__":
    # fixed seed for reproducibility; run Karras generation from the CLI
    # config, then preview the first grid of generated images
    seed_everything(0)
    dispatch(KarrasGen)
    visualize(16)
run_nerf.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from pydantic import validator
3
+
4
+ from my.config import BaseConf, SingleOrList, dispatch
5
+ from my.utils.seed import seed_everything
6
+
7
+ import numpy as np
8
+ from voxnerf.vox import VOXRF_REGISTRY
9
+ from voxnerf.pipelines import train
10
+
11
+
12
class VoxConfig(BaseConf):
    """Config + factory for a voxel radiance field model."""
    model_type: str = "VoxRF"  # key into VOXRF_REGISTRY
    bbox_len: float = 1.5  # scales the symmetric scene bounding box below
    grid_size: SingleOrList(int) = [128, 128, 128]
    step_ratio: float = 0.5
    density_shift: float = -10.
    ray_march_weight_thres: float = 0.0001
    c: int = 3  # channel count of the rendered output (3 = RGB)
    blend_bg_texture: bool = False
    bg_texture_hw: int = 64

    @validator("grid_size")
    def check_gsize(cls, grid_size):
        # a single int is shorthand for a cubic grid; otherwise require
        # exactly three dimensions
        if isinstance(grid_size, int):
            return [grid_size, ] * 3
        else:
            assert len(grid_size) == 3
            return grid_size

    def make(self):
        """Instantiate the configured voxel model from the registry."""
        params = self.dict()
        m_type = params.pop("model_type")
        model_fn = VOXRF_REGISTRY.get(m_type)

        # origin-centered axis-aligned bounding box, scaled by bbox_len
        radius = params.pop('bbox_len')
        aabb = radius * np.array([
            [-1, -1, -1],
            [1, 1, 1]
        ])
        model = model_fn(aabb=aabb, **params)
        return model
43
+
44
+
45
class TrainerConfig(BaseConf):
    """Top-level config for plain voxel-NeRF training on a single scene."""
    model: VoxConfig = VoxConfig()
    scene: str = "lego"
    n_epoch: int = 2
    bs: int = 4096  # presumably rays per batch — TODO confirm against train()
    lr: float = 0.02

    def run(self):
        # every field except the model config is forwarded to train() as kwargs
        args = self.dict()
        args.pop("model")

        model = self.model.make()
        train(model, **args)
58
+
59
+
60
if __name__ == "__main__":
    # fixed seed for reproducible training runs
    seed_everything(0)
    dispatch(TrainerConfig)
run_sjc.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from imageio import imwrite
7
+ from pydantic import validator
8
+
9
+ from my.utils import (
10
+ tqdm, EventStorage, HeartBeat, EarlyLoopBreak,
11
+ get_event_storage, get_heartbeat, read_stats
12
+ )
13
+ from my.config import BaseConf, dispatch, optional_load_config
14
+ from my.utils.seed import seed_everything
15
+
16
+ from adapt import ScoreAdapter, karras_t_schedule
17
+ from run_img_sampling import GDDPM, SD, StableDiffusion
18
+ from misc import torch_samps_to_imgs
19
+ from pose import PoseConfig
20
+
21
+ from run_nerf import VoxConfig
22
+ from voxnerf.utils import every
23
+ from voxnerf.render import (
24
+ as_torch_tsrs, rays_from_img, ray_box_intersect, render_ray_bundle
25
+ )
26
+ from voxnerf.vis import stitch_vis, bad_vis as nerf_vis
27
+
28
+
29
+ device_glb = torch.device("cuda")
30
+
31
+
32
def tsr_stats(tsr):
    """Summary statistics (mean/std/max) of a tensor, as Python floats."""
    stats = {}
    for stat_name in ("mean", "std", "max"):
        stats[stat_name] = getattr(tsr, stat_name)().item()
    return stats
38
+
39
+
40
class SJC(BaseConf):
    """Score Jacobian Chaining config: distills a 2D score model's prior
    into a 3D voxel radiance field."""
    family: str = "sd"  # which score model config below to use ("gddpm" or "sd")
    gddpm: GDDPM = GDDPM()
    sd: SD = SD(
        variant="v1",
        prompt="A high quality photo of a delicious burger",
        scale=100.0
    )
    lr: float = 0.05
    n_steps: int = 10000
    vox: VoxConfig = VoxConfig(
        model_type="V_SD", grid_size=100, density_shift=-1.0, c=3,
        blend_bg_texture=True, bg_texture_hw=4,
        bbox_len=1.0
    )
    pose: PoseConfig = PoseConfig(rend_hw=64, FoV=60.0, R=1.5)

    # emptiness regularization: encourages ray weights towards zero
    emptiness_scale: int = 10
    emptiness_weight: int = 1e4
    emptiness_step: float = 0.5  # fraction of n_steps after which the multiplier kicks in
    emptiness_multiplier: float = 20.0

    depth_weight: int = 0  # 0 disables the center-vs-border depth loss

    var_red: bool = True  # variance-reduced gradient (Ds - y) instead of (Ds - zs)

    @validator("vox")
    def check_vox(cls, vox_cfg, values):
        # stable diffusion renders in a 4-channel latent space, so force c=4
        family = values['family']
        if family == "sd":
            vox_cfg.c = 4
        return vox_cfg

    def run(self):
        cfgs = self.dict()

        # the chosen family's config builds the score model adapter
        family = cfgs.pop("family")
        model = getattr(self, family).make()

        cfgs.pop("vox")
        vox = self.vox.make()

        cfgs.pop("pose")
        poser = self.pose.make()

        # remaining scalar cfgs (lr, n_steps, emptiness_*, ...) pass through
        sjc_3d(**cfgs, poser=poser, model=model, vox=vox)
86
+
87
+
88
def sjc_3d(
    poser, vox, model: ScoreAdapter,
    lr, n_steps, emptiness_scale, emptiness_weight, emptiness_step, emptiness_multiplier,
    depth_weight, var_red, **kwargs
):
    """Main SJC optimization loop: renders random views of the voxel field
    and applies the chained score of a 2D diffusion model as the gradient.

    Mutates `vox` in place; logs scalars/artifacts via EventStorage and
    saves a final checkpoint plus a test-set evaluation.
    """
    del kwargs  # extra config entries are intentionally ignored

    assert model.samps_centered()
    _, target_H, target_W = model.data_shape()
    bs = 1
    aabb = vox.aabb.T.cpu().numpy()
    vox = vox.to(device_glb)
    opt = torch.optim.Adamax(vox.opt_params(), lr=lr)

    H, W = poser.H, poser.W
    # one camera pose + prompt prefix pre-sampled per training step
    Ks, poses, prompt_prefixes = poser.sample_train(n_steps)

    # usable noise levels; extremes of the schedule are trimmed off
    ts = model.us[30:-10]
    fuse = EarlyLoopBreak(5)

    # NOTE(review): same_noise is computed but never used below
    same_noise = torch.randn(1, 4, H, W, device=model.device).repeat(bs, 1, 1, 1)

    with tqdm(total=n_steps) as pbar, \
            HeartBeat(pbar) as hbeat, \
            EventStorage() as metric:
        for i in range(n_steps):
            if fuse.on_break():
                break

            # view-dependent prompt, e.g. direction prefix + user prompt
            p = f"{prompt_prefixes[i]} {model.prompt}"
            score_conds = model.prompts_emb([p])

            y, depth, ws = render_one_view(vox, aabb, H, W, Ks[i], poses[i], return_w=True)

            if isinstance(model, StableDiffusion):
                pass  # SD consumes the rendered latent at its native size
            else:
                y = torch.nn.functional.interpolate(y, (target_H, target_W), mode='bilinear')

            opt.zero_grad()

            # the score itself is computed without autograd; it is injected
            # below as the upstream gradient of y
            with torch.no_grad():
                chosen_σs = np.random.choice(ts, bs, replace=False)
                chosen_σs = chosen_σs.reshape(-1, 1, 1, 1)
                chosen_σs = torch.as_tensor(chosen_σs, device=model.device, dtype=torch.float32)
                # chosen_σs = us[i]

                noise = torch.randn(bs, *y.shape[1:], device=model.device)

                zs = y + chosen_σs * noise
                Ds = model.denoise(zs, chosen_σs, **score_conds)

                if var_red:
                    # variance-reduced form: subtract the render, not the noised render
                    grad = (Ds - y) / chosen_σs
                else:
                    grad = (Ds - zs) / chosen_σs

                grad = grad.mean(0, keepdim=True)

            # chain rule: backprop -grad through the renderer into the voxels
            y.backward(-grad, retain_graph=True)

            if depth_weight > 0:
                # encourage the object (center) to sit in front of the border
                center_depth = depth[7:-7, 7:-7]
                border_depth_mean = (depth.sum() - center_depth.sum()) / (64*64-50*50)
                center_depth_mean = center_depth.mean()
                depth_diff = center_depth_mean - border_depth_mean
                depth_loss = - torch.log(depth_diff + 1e-12)
                depth_loss = depth_weight * depth_loss
                depth_loss.backward(retain_graph=True)

            # push ray weights towards zero; ramped up late in training
            emptiness_loss = torch.log(1 + emptiness_scale * ws).mean()
            emptiness_loss = emptiness_weight * emptiness_loss
            if emptiness_step * n_steps <= i:
                emptiness_loss *= emptiness_multiplier
            emptiness_loss.backward()

            opt.step()

            metric.put_scalars(**tsr_stats(y))

            # periodic visualization of the current render
            if every(pbar, percent=1):
                with torch.no_grad():
                    if isinstance(model, StableDiffusion):
                        y = model.decode(y)
                    vis_routine(metric, y, depth)

            # if every(pbar, step=2500):
            #     metric.put_artifact(
            #         "ckpt", ".pt", lambda fn: torch.save(vox.state_dict(), fn)
            #     )
            #     with EventStorage("test"):
            #         evaluate(model, vox, poser)

            metric.step()
            pbar.update()
            pbar.set_description(p)
            hbeat.beat()

        # final checkpoint + held-out evaluation
        metric.put_artifact(
            "ckpt", ".pt", lambda fn: torch.save(vox.state_dict(), fn)
        )
        with EventStorage("test"):
            evaluate(model, vox, poser)

        metric.step()

        hbeat.done()
195
+
196
+
197
@torch.no_grad()
def evaluate(score_model, vox, poser):
    """Render 100 test poses of the trained voxel field, log each view,
    and stitch them into an mp4 sequence artifact."""
    H, W = poser.H, poser.W
    vox.eval()
    K, poses = poser.sample_test(100)

    fuse = EarlyLoopBreak(5)
    # reuse the ambient storage/heartbeat set up by the caller
    metric = get_event_storage()
    hbeat = get_heartbeat()

    aabb = vox.aabb.T.cpu().numpy()
    vox = vox.to(device_glb)

    num_imgs = len(poses)

    for i in (pbar := tqdm(range(num_imgs))):
        if fuse.on_break():
            break

        pose = poses[i]
        y, depth = render_one_view(vox, aabb, H, W, K, pose)
        # SD renders latents; decode to pixel space before visualizing
        if isinstance(score_model, StableDiffusion):
            y = score_model.decode(y)
        vis_routine(metric, y, depth)

        metric.step()
        hbeat.beat()

    metric.flush_history()

    # assemble the per-view pngs written above into a video
    metric.put_artifact(
        "view_seq", ".mp4",
        lambda fn: stitch_vis(fn, read_stats(metric.output_dir, "view")[1])
    )

    metric.step()
233
+
234
+
235
def render_one_view(vox, aabb, H, W, K, pose, return_w=False):
    """Render one HxW view of the voxel field from camera (K, pose).

    Returns (rgbs, depth) — rgbs as 1xCxHxW, depth as HxW — plus the
    per-ray weights when `return_w` is set.
    """
    n_pix = H * W
    ro, rd = rays_from_img(H, W, K, pose)
    ro, rd, t_min, t_max = scene_box_filter(ro, rd, aabb)
    assert len(ro) == n_pix, "for now all pixels must be in"
    ro, rd, t_min, t_max = as_torch_tsrs(vox.device, ro, rd, t_min, t_max)
    rgbs, depth, weights = render_ray_bundle(vox, ro, rd, t_min, t_max)

    # flat per-ray outputs back to image layout
    rgbs = rearrange(rgbs, "(h w) c -> 1 c h w", h=H, w=W)
    depth = rearrange(depth, "(h w) 1 -> h w", h=H, w=W)
    return (rgbs, depth, weights) if return_w else (rgbs, depth)
249
+
250
+
251
def scene_box_filter(ro, rd, aabb):
    """Intersect rays with the scene box and clamp entry/exit distances
    to be non-negative; rays themselves pass through unchanged."""
    _, t_min, t_max = ray_box_intersect(ro, rd, aabb)
    # do not render what's behind the ray origin
    t_min = np.maximum(t_min, 0)
    t_max = np.maximum(t_max, 0)
    return ro, rd, t_min, t_max
256
+
257
+
258
def vis_routine(metric, y, depth):
    """Log one rendered view: stitched visualization pane, raw image,
    and the depth map as a numpy artifact."""
    im = torch_samps_to_imgs(y)[0]
    pane = nerf_vis(y, depth, final_H=256)
    depth = depth.cpu().numpy()
    metric.put_artifact("view", ".png", lambda fn: imwrite(fn, pane))
    metric.put_artifact("img", ".png", lambda fn: imwrite(fn, im))
    metric.put_artifact("depth", ".npy", lambda fn: np.save(fn, depth))
265
+
266
+
267
def evaluate_ckpt():
    """Rebuild the model/vox/poser from the saved run config, load the
    most recent checkpoint, and run test-view evaluation."""
    cfg = optional_load_config(fname="full_config.yml")
    assert len(cfg) > 0, "can't find cfg file"
    mod = SJC(**cfg)

    family = cfg.pop("family")
    model: ScoreAdapter = getattr(mod, family).make()
    vox = mod.vox.make()
    poser = mod.pose.make()

    pbar = tqdm(range(1))

    with EventStorage(), HeartBeat(pbar):
        # restore voxel weights on CPU first, then move to the GPU
        ckpt_fname = latest_ckpt()
        state = torch.load(ckpt_fname, map_location="cpu")
        vox.load_state_dict(state)
        vox.to(device_glb)

        with EventStorage("test"):
            evaluate(model, vox, poser)
287
+
288
+
289
def latest_ckpt():
    """Return the path of the most recent "ckpt" artifact in the cwd."""
    _, fnames = read_stats("./", "ckpt")
    assert len(fnames) > 0
    return fnames[-1]
293
+
294
+
295
if __name__ == "__main__":
    # fixed seed for reproducible SJC runs
    seed_everything(0)
    dispatch(SJC)
    # evaluate_ckpt()  # alternative entry: evaluate from a saved checkpoint