amankishore committed
Commit • 7a11626
1 Parent(s): 0426313
Updated app.py
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +3 -0
- .gitignore +150 -0
- LICENSE +82 -0
- README-orig.md +220 -0
- adapt.py +163 -0
- adapt_gddpm.py +562 -0
- adapt_ncsn.py +101 -0
- adapt_sd.py +235 -0
- adapt_vesde.py +84 -0
- app.py +155 -0
- env.json +3 -0
- guided_diffusion/README.md +5 -0
- guided_diffusion/__init__.py +3 -0
- guided_diffusion/fp16_util.py +237 -0
- guided_diffusion/gaussian_diffusion.py +908 -0
- guided_diffusion/losses.py +77 -0
- guided_diffusion/nn.py +170 -0
- guided_diffusion/respace.py +128 -0
- guided_diffusion/script_util.py +452 -0
- guided_diffusion/unet.py +894 -0
- misc.py +53 -0
- my/README.md +2 -0
- my/__init__.py +0 -0
- my/config.py +234 -0
- my/registry.py +62 -0
- my/utils/__init__.py +4 -0
- my/utils/debug.py +15 -0
- my/utils/event.py +142 -0
- my/utils/heartbeat.py +78 -0
- my/utils/plot.py +9 -0
- my/utils/seed.py +21 -0
- my/utils/ticker.py +18 -0
- my/utils/tqdm.py +10 -0
- my3d.py +160 -0
- ncsn/__init__.py +0 -0
- ncsn/bedroom.yml +69 -0
- ncsn/ema.py +47 -0
- ncsn/layers.py +456 -0
- ncsn/ncsnv2.py +314 -0
- ncsn/normalization.py +208 -0
- pose.py +120 -0
- release/diffusion_ckpts/guided_ddpm/models/lsun_bedroom.pt +3 -0
- release/diffusion_ckpts/guided_ddpm/models/lsun_ffhq.pt +3 -0
- release/diffusion_ckpts/stable_diffusion/sd-v1-5.ckpt +3 -0
- requirements.txt +16 -0
- run_img_sampling.py +235 -0
- run_nerf.py +62 -0
- run_sjc.py +298 -0
- sd1/__init__.py +0 -0
- sd1/configs/v1-finetune_textual_inverison.yaml +106 -0
.gitattributes
CHANGED
@@ -32,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+release/diffusion_ckpts/guided_ddpm/models/lsun_bedroom.pt filter=lfs diff=lfs merge=lfs -text
+release/diffusion_ckpts/guided_ddpm/models/lsun_ffhq.pt filter=lfs diff=lfs merge=lfs -text
+release/diffusion_ckpts/stable_diffusion/sd-v1-5.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,150 @@
+*.png
+
+# sd1/
+# sd2/
+
+sde/
+
+notebooks/
+out/
+slurm_outputs/
+
+FID/torch_utils/
+FID/dnnlib/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+ckpt/
+depth/
+img/
+test*/
+view/
+vis/
LICENSE
ADDED
@@ -0,0 +1,82 @@
+Copyright (c) 2022 Score Jacobian Chaining authors
+
+CreativeML Open RAIL-M
+dated August 22, 2022
+
+Section I: PREAMBLE
+
+Multimodal generative models are being widely adopted and used, and have the potential to transform the way artists, among other individuals, conceive and benefit from AI or ML technologies as a tool for content creation.
+
+Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
+
+In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the Model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for art and content generation.
+
+Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this License aims to strike a balance between both in order to enable responsible open-science in the field of AI.
+
+This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
+
+NOW THEREFORE, You and Licensor agree as follows:
+
+1. Definitions
+
+- "License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
+- "Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
+- "Output" means the results of operating a Model as embodied in informational content resulting therefrom.
+- "Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
+- "Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
+- "Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
+- "Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
+- "Licensor" means the copyright owner or entity authorized by the copyright owner that is granting the License, including the persons or entities that may have rights in the Model and/or distributing the Model.
+- "You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, image generator.
+- "Third Parties" means individuals or legal entities that are not under common control with Licensor or You.
+- "Contribution" means any work of authorship, including the original version of the Model and any modifications or additions to that Model or Derivatives of the Model thereof, that is intentionally submitted to Licensor for inclusion in the Model by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Model, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
+- "Contributor" means Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Model.
+
+Section II: INTELLECTUAL PROPERTY RIGHTS
+
+Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
+3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Model to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material or a Contribution incorporated within the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or Work shall terminate as of the date such litigation is asserted or filed.
+
+Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
+
+4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
+Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
+You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
+You must cause any modified files to carry prominent notices stating that You changed the files;
+You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
+You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. - for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
+5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
+6. The Output You Generate. Except as set forth herein, Licensor claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
+
+Section IV: OTHER PROVISIONS
+
+7. Updates and Runtime Restrictions. To the maximum extent permitted by law, Licensor reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License, update the Model through electronic means, or modify the Output of the Model based on updates. You shall undertake reasonable efforts to use the latest version of the Model.
+8. Trademarks and related. Nothing in this License permits You to make use of Licensors’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by the Licensors.
+9. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Model and the Complementary Material (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
+10. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
+11. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
+12. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
+
+END OF TERMS AND CONDITIONS
+
+
+
+
+Attachment A
+
+Use Restrictions
+
+You agree not to use the Model or Derivatives of the Model:
+- In any way that violates any applicable national, federal, state, local or international law or regulation;
+- For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+- To generate or disseminate verifiably false information and/or content with the purpose of harming others;
+- To generate or disseminate personal identifiable information that can be used to harm an individual;
+- To defame, disparage or otherwise harass others;
+- For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
+- For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
+- To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+- For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories;
+- To provide medical advice and medical results interpretation;
+- To generate or disseminate information for the purpose to be used for administration of justice, law enforcement, immigration or asylum processes, such as predicting an individual will commit fraud/crime commitment (e.g. by text profiling, drawing causal relationships between assertions made in documents, indiscriminate and arbitrarily-targeted use).
README-orig.md
ADDED
@@ -0,0 +1,220 @@
+# Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation
+
+[Haochen Wang*](https://whc.is/),
+[Xiaodan Du*](https://github.com/duxiaodan),
+[Jiahao Li*](https://www.linkedin.com/in/jiahaoli95/),
+[Raymond A. Yeh†](https://raymond-yeh.com),
+[Greg Shakhnarovich](https://home.ttic.edu/~gregory/)
+(* indicates equal contribution)
+
+TTI-Chicago, †Purdue University
+
+The repository contains Pytorch implementation of Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation.
+
+> We introduce a method that converts a pretrained 2D diffusion generative model on images into a 3D generative model of radiance fields, without requiring access to any 3D data. The key insight is to interpret diffusion models as learned predictors of a gradient field, often referred to as the score function of the data log-likelihood. We apply the chain rule on the estimated score, hence the name Score Jacobian Chaining (SJC).
+
+<a href="https://arxiv.org/abs/2212.00774"><img src="https://img.shields.io/badge/arXiv-2212.00774-b31b1b.svg" height=22.5></a>
+<a href="https://colab.research.google.com/drive/1zixo66UYGl70VOPy053o7IV_YkQt5lCZ?usp=sharing"><img src="https://colab.research.google.com/assets/colab-badge.svg" height=22.5></a>
+<a href="https://pals.ttic.edu/p/score-jacobian-chaining"><img src="https://img.shields.io/website?down_color=lightgrey&down_message=offline&label=Project%20Page&up_color=lightgreen&up_message=online&url=https%3A%2F%2Fpals.ttic.edu%2Fp%2Fscore-jacobian-chaining" height=22.5></a>
+
+<!-- [ [arxiv](https://arxiv.org/abs/2212.00774) | [project page](https://pals.ttic.edu/p/score-jacobian-chaining) | [colab](https://colab.research.google.com/drive/1zixo66UYGl70VOPy053o7IV_YkQt5lCZ?usp=sharing ) ] -->
+
+Many thanks to [dvschultz](https://github.com/dvschultz) for the colab.
+
+## License
+Since we use Stable Diffusion, we are releasing under their OpenRAIL license. Otherwise we do not
+identify any components or upstream code that carry restrictive licensing requirements.
+
+## Structure
+In addition to SJC, the repo also contains an implementation of [Karras sampler](https://arxiv.org/abs/2206.00364),
+and a customized, simple voxel nerf. We provide the abstract parent class based on Karras et. al. and include
+a few types of diffusion model here. See adapt.py.
+
+## Installation
+
+Install Pytorch according to your CUDA version, for example:
+```bash
+pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
+```
+
+Install other dependencies by `pip install -r requirements.txt`.
+
+Install `taming-transformers` manually
+```bash
+git clone --depth 1 git@github.com:CompVis/taming-transformers.git && pip install -e taming-transformers
+```
+
+## Downloading checkpoints
+We have bundled a minimal set of things you need to download (SD v1.5 ckpt, gddpm ckpt for LSUN and FFHQ)
+in a tar file, made available at our download server [here](https://dl.ttic.edu/pals/sjc/release.tar).
+It is a single file of 12GB, and you can use wget or curl.
+
+Remember to __update__ `env.json` to point at the new checkpoint root where you have uncompressed the files.
+
+## Usage
+Make a new directory to run experiments (the script generates many logging files. Do not run at the root of the code repo, else risk contamination.)
+```bash
+mkdir exp
+cd exp
+```
+Run the following command to generate a new 3D asset. It takes about 25 minutes on a single A5000 GPU for 10000 steps of optimization.
+```bash
+python /path/to/sjc/run_sjc.py \
+--sd.prompt "A zoomed out high quality photo of Temple of Heaven" \
+--n_steps 10000 \
+--lr 0.05 \
+--sd.scale 100.0 \
+--emptiness_weight 10000 \
+--emptiness_step 0.5 \
+--emptiness_multiplier 20.0 \
+--depth_weight 0 \
+--var_red False
+```
+`sd.prompt` is the prompt to the stable diffusion model
+
+`n_steps` is the number of gradient steps
+
+`lr` is the base learning rate of the optimizer
+
+`sd.scale` is the guidance scale for stable diffusion
+
+`emptiness_weight` is the weighting factor of the emptiness loss
+
+`emptiness_step` indicates after `emptiness_step * n_steps` update steps, the `emptiness_weight` is multiplied by `emptiness_multiplier`.
+
+`emptiness_multipler` see above
+
+`depth_weight` the weighting factor of the center depth loss
+
+`var_red` whether to use Eq. 16 vs Eq. 15. For some prompts such as Obama we actually see better results with Eq. 15.
+
+Visualization results are stored in the current directory. In directories named `test_*` there are images (under `view`) and videos (under `view_seq`) rendered at different iterations.
+
+
+## TODOs
+- [ ] add sub-pixel rendering script for high quality visualization such as in the teaser.
+- [ ] add script to reproduce 2D experiments in Fig 4. The Fig might need change once it's tied to seeds. Note that for a simple aligned domain like faces, simple scheduling like using a single σ=1.5 could already generate some nice images. But not so for bedrooms; it's too diverse and annealing seems still needed.
+
+## To Reproduce the Results in the Paper
+First create a clean directory for your experiment, then run one of the following scripts from that folder:
+### Trump
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "Trump figure" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Obama
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "Obama figure" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Biden
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "Biden figure" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Temple of Heaven
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out high quality photo of Temple of Heaven" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Burger
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a delicious burger" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Icecream
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a chocolate icecream cone" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
+
+```
+### Ficus
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A ficus planted in a pot" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 100
+```
+### Castle
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out photo a small castle" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
+```
+### Sydney Opera House
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A zoomed out high quality photo of Sydney Opera House" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Rose
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "a DSLR photo of a rose" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
+```
+### School Bus
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a yellow school bus" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
+```
+### Rocket
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A wide angle zoomed out photo of Saturn V rocket from distance" --n_steps 30000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
+```
+### French Fries
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of french fries from McDonald's" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
+```
+### Motorcycle
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a toy motorcycle" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Car
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a classic silver muscle car" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Tank
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A product photo of a toy tank" --n_steps 20000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Chair
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A high quality photo of a Victorian style wooden chair with velvet upholstery" --n_steps 50000 --lr 0.01 --sd.scale 100.0 --emptiness_weight 7000
+```
+### Duck
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "a DSLR photo of a yellow duck" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 10
+```
+### Horse
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A photo of a horse walking" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+### Giraffe
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A wide angle zoomed out photo of a giraffe" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 50
+```
+### Zebra
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A photo of a zebra walking" --n_steps 10000 --lr 0.02 --sd.scale 100.0 --emptiness_weight 30000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
+```
+### Printer
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A product photo of a Canon home printer" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
+```
+### Zelda Link
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "Zelda Link" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0 --var_red False
+```
+### Pig
+```
+python /path/to/sjc/run_sjc.py --sd.prompt "A pig" --n_steps 10000 --lr 0.05 --sd.scale 100.0 --emptiness_weight 10000 --emptiness_step 0.5 --emptiness_multiplier 20.0 --depth_weight 0
+```
+
+
+## To Test the Voxel NeRF
+```
+python /path/to/sjc/run_nerf.py
+```
+Our bundle contains a tar ball for the lego bulldozer dataset. Untar it and it will work.
+
+## To Sample 2D images with the Karras Sampler
+```
+python /path/to/sjc/run_img_sampling.py
+```
+Use help -h to see the options available. Will expand the details later.
+
+
+## Bib
+```
+@article{sjc,
+    title={Score Jacobian Chaining: Lifting Pretrained 2D Diffusion Models for 3D Generation},
+    author={Wang, Haochen and Du, Xiaodan and Li, Jiahao and Yeh, Raymond A. and Shakhnarovich, Greg},
+    journal={arXiv preprint arXiv:2212.00774},
+    year={2022},
+}
+```
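
Editor's note on the README above: the parameter descriptions fully specify the emptiness-weight schedule (`emptiness_weight` is multiplied by `emptiness_multiplier` once `emptiness_step * n_steps` update steps have passed). As a reading aid only, here is a minimal Python sketch of that rule; the helper name `emptiness_weight_at` is hypothetical and this code is not part of the commit.

```python
def emptiness_weight_at(step, n_steps=10000, emptiness_weight=10000,
                        emptiness_step=0.5, emptiness_multiplier=20.0):
    # Hypothetical helper: after emptiness_step * n_steps update steps,
    # the emptiness weight is scaled by emptiness_multiplier (per the README).
    if step >= emptiness_step * n_steps:
        return emptiness_weight * emptiness_multiplier
    return emptiness_weight


print(emptiness_weight_at(1000))  # 10000 before the switch point
print(emptiness_weight_at(6000))  # 200000.0 once half of the 10000 steps are done
```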
adapt.py
ADDED
@@ -0,0 +1,163 @@
+from pathlib import Path
+import json
+from math import sqrt
+import numpy as np
+import torch
+from abc import ABCMeta, abstractmethod
+
+
+class ScoreAdapter(metaclass=ABCMeta):
+
+    @abstractmethod
+    def denoise(self, xs, σ, **kwargs):
+        pass
+
+    def score(self, xs, σ, **kwargs):
+        Ds = self.denoise(xs, σ, **kwargs)
+        grad_log_p_t = (Ds - xs) / (σ ** 2)
+        return grad_log_p_t
+
+    @abstractmethod
+    def data_shape(self):
+        return (3, 256, 256)  # for example
+
+    def samps_centered(self):
+        # if centered, samples expected to be in range [-1, 1], else [0, 1]
+        return True
+
+    @property
+    @abstractmethod
+    def σ_max(self):
+        pass
+
+    @property
+    @abstractmethod
+    def σ_min(self):
+        pass
+
+    def cond_info(self, batch_size):
+        return {}
+
+    @abstractmethod
+    def unet_is_cond(self):
+        return False
+
+    @abstractmethod
+    def use_cls_guidance(self):
+        return False  # most models do not use cls guidance
+
+    def classifier_grad(self, xs, σ, ys):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def snap_t_to_nearest_tick(self, t):
+        # need to confirm for each model; continuous time model doesn't need this
+        return t, None
+
+    @property
+    def device(self):
+        return self._device
+
+    def checkpoint_root(self):
+        """the path at which the pretrained checkpoints are stored"""
+        with Path(__file__).resolve().with_name("env.json").open("r") as f:
+            root = json.load(f)['data_root']
+        root = Path(root) / "diffusion_ckpts"
+        return root
+
+
+def karras_t_schedule(ρ=7, N=10, σ_max=80, σ_min=0.002):
+    ts = []
+    for i in range(N):
+
+        t = (
+            σ_max ** (1 / ρ) + (i / (N - 1)) * (σ_min ** (1 / ρ) - σ_max ** (1 / ρ))
+        ) ** ρ
+        ts.append(t)
+    return ts
+
+
+def power_schedule(σ_max, σ_min, num_stages):
+    σs = np.exp(np.linspace(np.log(σ_max), np.log(σ_min), num_stages))
+    return σs
+
+
+class Karras():
+
+    @classmethod
+    @torch.no_grad()
+    def inference(
+        cls, model, batch_size, num_t, *,
+        σ_max=80, cls_scaling=1,
+        init_xs=None, heun=True,
+        langevin=False,
+        S_churn=80, S_min=0.05, S_max=50, S_noise=1.003,
+    ):
+        σ_max = min(σ_max, model.σ_max)
+        σ_min = model.σ_min
+        ts = karras_t_schedule(ρ=7, N=num_t, σ_max=σ_max, σ_min=σ_min)
+        assert len(ts) == num_t
+        ts = [model.snap_t_to_nearest_tick(t)[0] for t in ts]
+        ts.append(0)  # 0 is the destination
+        σ_max = ts[0]
+
+        cond_inputs = model.cond_info(batch_size)
+
+        def compute_step(xs, σ):
+            grad_log_p_t = model.score(
+                xs, σ, **(cond_inputs if model.unet_is_cond() else {})
+            )
+            if model.use_cls_guidance():
+                grad_cls = model.classifier_grad(xs, σ, cond_inputs["y"])
+                grad_cls = grad_cls * cls_scaling
+                grad_log_p_t += grad_cls
+            d_i = -1 * σ * grad_log_p_t
+            return d_i
+
+        if init_xs is not None:
+            xs = init_xs.to(model.device)
+        else:
+            xs = σ_max * torch.randn(
+                batch_size, *model.data_shape(), device=model.device
+            )
+
+        yield xs
+
+        for i in range(num_t):
+            t_i = ts[i]
+
+            if langevin and (S_min < t_i and t_i < S_max):
+                xs, t_i = cls.noise_backward_in_time(
+                    model, xs, t_i, S_noise, S_churn / num_t
+                )
+
+            Δt = ts[i+1] - t_i
+
+            d_1 = compute_step(xs, σ=t_i)
+            xs_1 = xs + Δt * d_1
+
+            # Heun's 2nd order method; don't apply on the last step
+            if (not heun) or (ts[i+1] == 0):
+                xs = xs_1
+            else:
+                d_2 = compute_step(xs_1, σ=ts[i+1])
+                xs = xs + Δt * (d_1 + d_2) / 2
+
+            yield xs
+
+    @staticmethod
+    def noise_backward_in_time(model, xs, t_i, S_noise, S_churn_i):
+        n = S_noise * torch.randn_like(xs)
+        γ_i = min(sqrt(2)-1, S_churn_i)
+        t_i_hat = t_i * (1 + γ_i)
+        t_i_hat = model.snap_t_to_nearest_tick(t_i_hat)[0]
+        xs = xs + n * sqrt(t_i_hat ** 2 - t_i ** 2)
+        return xs, t_i_hat
+
+
+def test():
+    pass
+
+
+if __name__ == "__main__":
+    test()
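
Editor's note on adapt.py above: `ScoreAdapter` exposes a denoiser D(x, σ) and derives the score as (D(x, σ) − x) / σ², and `Karras.inference` walks a Karras-style σ schedule with Heun steps. Below is a minimal usage sketch, not part of this commit: a toy adapter whose data distribution is a unit Gaussian, so the optimal denoiser x / (1 + σ²) is known in closed form. The class name `UnitGaussianAdapter` and the assumption that adapt.py is importable from the working directory are the editor's, not the repo's.

```python
# Toy smoke test for the sampler loop in adapt.py (hypothetical, not from the commit).
import torch
from adapt import ScoreAdapter, Karras  # assumes adapt.py is on the import path


class UnitGaussianAdapter(ScoreAdapter):
    """Data ~ N(0, I), so the posterior-mean denoiser is x / (1 + σ²)."""

    def __init__(self, device="cpu"):
        self._device = torch.device(device)

    def denoise(self, xs, σ, **kwargs):
        return xs / (1 + σ ** 2)  # exact denoiser for a unit Gaussian prior

    def data_shape(self):
        return (3, 16, 16)

    @property
    def σ_max(self):
        return 80.0

    @property
    def σ_min(self):
        return 0.002

    def unet_is_cond(self):
        return False

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        return t, None  # continuous-time toy model: nothing to snap to


model = UnitGaussianAdapter()
for xs in Karras.inference(model, batch_size=4, num_t=10):
    pass  # the generator yields the iterate after every Heun step
print(xs.std())  # should land near 1.0, the std of the target N(0, I)
```

Because the probability-flow ODE is solved almost exactly for this linear toy problem, the final iterate's standard deviation should come out close to 1, which makes the sketch a convenient sanity check on the Heun loop before plugging in a real diffusion model.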
adapt_gddpm.py
ADDED
@@ -0,0 +1,562 @@
+from pathlib import Path
+from math import sin, pi, sqrt
+from functools import partial
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from easydict import EasyDict
+from guided_diffusion.script_util import (
+    create_model_and_diffusion,
+    model_and_diffusion_defaults,
+
+    NUM_CLASSES,
+    create_classifier,
+    classifier_defaults,
+
+    sr_create_model_and_diffusion,
+    sr_model_and_diffusion_defaults,
+)
+
+from adapt import ScoreAdapter
+
+from my.registry import Registry
+
+PRETRAINED_REGISTRY = Registry("pretrained")
+
+
+device = torch.device("cuda")
+
+
+def load_ckpt(path, **kwargs):
+    # with bf.BlobFile(path, "rb") as f:
+    #     data = f.read()
+    return torch.load(path, **kwargs)
+
+
+def pick_out_cfgs(src, target_ks):
+    return {k: src[k] for k in target_ks}
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_64():
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=True,
+        diffusion_steps=1000,
+        dropout=0.1,
+        image_size=64,
+        learn_sigma=True,
+        noise_schedule="cosine",
+        num_channels=192,
+        num_head_channels=64,
+        num_res_blocks=3,
+        resblock_updown=True,
+        use_new_attention_order=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        classifier_depth=4,
+
+        classifier_scale=1.0,
+        model_path="models/64x64_diffusion.pt",
+        classifier_path="models/64x64_classifier.pt",
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_128():
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=True,
+        diffusion_steps=1000,
+        image_size=128,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=256,
+        num_heads=4,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        classifier_scale=0.5,
+        model_path="models/128x128_diffusion.pt",
+        classifier_path="models/128x128_classifier.pt",
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_256():
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=True,
+        diffusion_steps=1000,
+        image_size=256,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=256,
+        num_head_channels=64,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        classifier_scale=1.0,
+        model_path="models/256x256_diffusion.pt",
+        classifier_path="models/256x256_classifier.pt"
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_256_uncond():
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=False,
+        diffusion_steps=1000,
+        image_size=256,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=256,
+        num_head_channels=64,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        classifier_scale=10.0,
+        model_path="models/256x256_diffusion_uncond.pt",
+        classifier_path="models/256x256_classifier.pt",
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_512():
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=True,
+        diffusion_steps=1000,
+        image_size=512,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=256,
+        num_head_channels=64,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=False,
+        use_scale_shift_norm=True,
+
+        classifier_scale=4.0,
+        model_path="models/512x512_diffusion.pt",
+        classifier_path="models/512x512_classifier.pt"
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_64_256(base_samples="64_samples.npz"):
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=True,
+        diffusion_steps=1000,
+        large_size=256,
+        small_size=64,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=192,
+        num_heads=4,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        model_path="models/64_256_upsampler.pt",
+
+        base_samples=base_samples,
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_imgnet_128_512(base_samples="128_samples.npz",):
+    return dict(
+        attention_resolutions="32,16",
+        class_cond=True,
+        diffusion_steps=1000,
+        large_size=512,
+        small_size=128,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=192,
+        num_head_channels=64,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        model_path="models/128_512_upsampler.pt",
+
+        base_samples=base_samples,
+    )
+
+
+@PRETRAINED_REGISTRY.register()
+def m_lsun_256(category="bedroom"):
+    return dict(
+        attention_resolutions="32,16,8",
+        class_cond=False,
+        diffusion_steps=1000,
+        dropout=0.1,
+        image_size=256,
+        learn_sigma=True,
+        noise_schedule="linear",
+        num_channels=256,
+        num_head_channels=64,
+        num_res_blocks=2,
+        resblock_updown=True,
+        use_fp16=True,
+        use_scale_shift_norm=True,
+
+        model_path=f"models/lsun_{category}.pt"
+    )
+
+
+def img_gen(specific_cfgs, num_samples=16, batch_size=16, load_only=False, ckpt_root=Path("")):
+    cfgs = EasyDict(
+        clip_denoised=True,
+        num_samples=num_samples,
+        batch_size=batch_size,
+        use_ddim=False,
+        model_path="",
+        classifier_path="",
+        classifier_scale=1.0,
+    )
+    cfgs.update(model_and_diffusion_defaults())
+    cfgs.update(classifier_defaults())
+    cfgs.update(specific_cfgs)
+
+    use_classifier_guidance = bool(cfgs.classifier_path)
+    class_aware = cfgs.class_cond or use_classifier_guidance
+
+    model, diffusion = create_model_and_diffusion(
+        **pick_out_cfgs(cfgs, model_and_diffusion_defaults().keys())
+    )
+    model.load_state_dict(
+        load_ckpt(str(ckpt_root / cfgs.model_path), map_location="cpu")
+    )
+    model.to(device)
+    if cfgs.use_fp16:
+        model.convert_to_fp16()
+    model.eval()
+
+    def model_fn(x, t, y=None):
+        return model(x, t, y if cfgs.class_cond else None)
+
+    classifier = None
+    cond_fn = None
+    if use_classifier_guidance:
+        classifier = create_classifier(
+            **pick_out_cfgs(cfgs, classifier_defaults().keys())
+        )
+        classifier.load_state_dict(
+            load_ckpt(str(ckpt_root / cfgs.classifier_path), map_location="cpu")
+        )
+        classifier.to(device)
+        if cfgs.classifier_use_fp16:
+            classifier.convert_to_fp16()
+        classifier.eval()
+
+        def cond_fn(x, t, y=None):
+            assert y is not None
+            with torch.enable_grad():
+                x_in = x.detach().requires_grad_(True)
+                logits = classifier(x_in, t)
+                log_probs = F.log_softmax(logits, dim=-1)
+                selected = log_probs[range(len(logits)), y.view(-1)]
+                return torch.autograd.grad(selected.sum(), x_in)[0] * cfgs.classifier_scale
+
+    if load_only:
+        return model, classifier
+
+    all_images = []
+    all_labels = []
+
+    while len(all_images) * cfgs.batch_size < cfgs.num_samples:
+        model_kwargs = {}
+
+        if class_aware:
+            classes = torch.randint(
+                low=0, high=NUM_CLASSES, size=(cfgs.batch_size,), device=device
+            )
+            model_kwargs["y"] = classes
+
+        sample_fn = (
+            diffusion.p_sample_loop if not cfgs.use_ddim else diffusion.ddim_sample_loop
+        )
+        sample = sample_fn(
+            model_fn,
+            (cfgs.batch_size, 3, cfgs.image_size, cfgs.image_size),
+            clip_denoised=cfgs.clip_denoised,
+            model_kwargs=model_kwargs,
+            cond_fn=cond_fn,
+            device=device,
+            progress=True
+        )
+        sample = ((sample + 1) * 127.5).clamp(0, 255).to(torch.uint8)
+        sample = sample.permute(0, 2, 3, 1)
+        sample = sample.contiguous()
+
+        all_images.append(sample.cpu().numpy())
+        if class_aware:
+            all_labels.append(classes.cpu().numpy())
+
+    arr = np.concatenate(all_images, axis=0)
+    arr = arr[:cfgs.num_samples]
+
+    if class_aware:
+        all_labels = np.concatenate(all_labels, axis=0)
+        all_labels = all_labels[:cfgs.num_samples]
+
+    shape_str = "x".join([str(x) for x in arr.shape])
+    out_path = Path("./out") / f"samples_{shape_str}.npz"
+    np.savez(out_path, arr, all_labels)
+
+
+def img_upsamp(specific_cfgs, num_samples=16, batch_size=16, load_only=False):
+    """note that here the ckpt root is not configured properly; will break but easy fix"""
+    cfgs = EasyDict(
+        clip_denoised=True,
+        num_samples=num_samples,
+        batch_size=batch_size,
+        use_ddim=False,
+        base_samples="",
+        model_path="",
+    )
+    cfgs.update(sr_model_and_diffusion_defaults())
+    cfgs.update(specific_cfgs)
+
+    model, diffusion = sr_create_model_and_diffusion(
+        **pick_out_cfgs(cfgs, sr_model_and_diffusion_defaults().keys())
+    )
+    model.load_state_dict(load_ckpt(cfgs.model_path, map_location="cpu"))
+    model.to(device)
+    if cfgs.use_fp16:
+        model.convert_to_fp16()
+    model.eval()
+
+    if load_only:
+        return model
+
+    data = load_low_res_samples(
+        cfgs.base_samples, cfgs.batch_size, cfgs.class_cond
+    )
+
+    all_images = []
+    while len(all_images) * cfgs.batch_size < cfgs.num_samples:
+        model_kwargs = next(data)
+        model_kwargs = {k: v.to(device) for k, v in model_kwargs.items()}
+        samples = diffusion.p_sample_loop(
+            model,
+            (cfgs.batch_size, 3, cfgs.large_size, cfgs.large_size),
+            clip_denoised=cfgs.clip_denoised,
+            model_kwargs=model_kwargs,
+            progress=True
+        )
+        samples = ((samples + 1) * 127.5).clamp(0, 255).to(torch.uint8)
+        samples = samples.permute(0, 2, 3, 1)
+        samples = samples.contiguous()
+
+        all_images.append(samples.cpu().numpy())
+
+    arr = np.concatenate(all_images, axis=0)
+    arr = arr[: cfgs.num_samples]
+
+    shape_str = "x".join([str(x) for x in arr.shape])
+    out_path = Path("./out") / f"samples_{shape_str}.npz"
+    np.savez(out_path, arr)
+
+
+def load_low_res_samples(base_samples, batch_size, class_cond):
+    obj = np.load(base_samples)
+    image_arr = obj["arr_0"]
+    if class_cond:
+        label_arr = obj["arr_1"]
+
+    buffer = []
+    label_buffer = []
+    while True:
+        for i in range(len(image_arr)):
+            buffer.append(image_arr[i])
+            if class_cond:
+                label_buffer.append(label_arr[i])
+
+            if len(buffer) == batch_size:
+                batch = torch.from_numpy(np.stack(buffer)).float()
+                batch = batch / 127.5 - 1.0
+                batch = batch.permute(0, 3, 1, 2)
+                res = {}
+                res["low_res"] = batch
+                if class_cond:
+                    res["y"] = torch.from_numpy(np.stack(label_buffer))
+                yield res
+                buffer, label_buffer = [], []
+
+
+def class_cond_info(imgnet_cat):
+
+    def rand_cond_fn(batch_size):
+        cats = torch.randint(
+            low=0, high=NUM_CLASSES, size=(batch_size,), device=device
+        )
+        return {"y": cats}
+
+    def class_specific_cond(batch_size):
+        cats = torch.tensor([imgnet_cat, ] * batch_size, device=device)
+        return {"y": cats}
+
+    if imgnet_cat == -1:
+        return rand_cond_fn
+    else:
+        return class_specific_cond
+
+
+def _sqrt(x):
+    if isinstance(x, float):
+        return sqrt(x)
+    else:
+        assert isinstance(x, torch.Tensor)
+        return torch.sqrt(x)
+
+
+class GuidedDDPM(ScoreAdapter):
+    def __init__(self, model, lsun_cat, imgnet_cat):
+        print(PRETRAINED_REGISTRY)
+        cfgs = PRETRAINED_REGISTRY.get(model)(
+            **({"category": lsun_cat} if model.startswith("m_lsun") else {})
+        )
+
+        self.unet, self.classifier = img_gen(
+            cfgs, load_only=True, ckpt_root=self.checkpoint_root() / "guided_ddpm"
+        )
+
+        H, W = cfgs['image_size'], cfgs['image_size']
+        self._data_shape = (3, H, W)
+
+        if cfgs['class_cond'] or (self.classifier is not None):
+            cond_func = class_cond_info(imgnet_cat)
+        else:
+            cond_func = lambda *args, **kwargs: {}
+        self.cond_func = cond_func
+
+        self._unet_is_cond = bool(cfgs['class_cond'])
+
+        noise_schedule = cfgs['noise_schedule']
+        assert noise_schedule in ("linear", "cosine")
+        self.M = 1000
+        if noise_schedule == "linear":
+            self.us = self.linear_us(self.M)
+            self._σ_min = 0.01
+        else:
+            self.us = self.cosine_us(self.M)
+            self._σ_min = 0.0064
+        self.noise_schedule = noise_schedule
+
+        self._device = next(self.unet.parameters()).device
+
+    def data_shape(self):
+        return self._data_shape
+
+    @property
+    def σ_max(self):
+        return self.us[0]
+
+    @property
+    def σ_min(self):
+        return self.us[-1]
+
+    @torch.no_grad()
+    def denoise(self, xs, σ, **model_kwargs):
+        N = xs.shape[0]
+        cond_t, σ = self.time_cond_vec(N, σ)
+        output = self.unet(
+            xs / _sqrt(1 + σ**2), cond_t, **model_kwargs
+        )
+        # not using the var pred
+        n_hat = torch.split(output, xs.shape[1], dim=1)[0]
+        Ds = xs - σ * n_hat
+        return Ds
+
+    def cond_info(self, batch_size):
+        return self.cond_func(batch_size)
+
+    def unet_is_cond(self):
+        return self._unet_is_cond
+
+    def use_cls_guidance(self):
+        return (self.classifier is not None)
+
+    @torch.no_grad()
+    def classifier_grad(self, xs, σ, ys):
+        N = xs.shape[0]
+        cond_t, σ = self.time_cond_vec(N, σ)
+        with torch.enable_grad():
+            x_in = xs.detach().requires_grad_(True)
+            logits = self.classifier(x_in, cond_t)
+            log_probs = F.log_softmax(logits, dim=-1)
+            selected = log_probs[range(len(logits)), ys.view(-1)]
+            grad = torch.autograd.grad(selected.sum(), x_in)[0]
+
+        grad = grad * (1 / sqrt(1 + σ**2))
+        return grad
+
+    def snap_t_to_nearest_tick(self, t):
+        j = np.abs(t - self.us).argmin()
+        return self.us[j], j
+
+    def time_cond_vec(self, N, σ):
+        if isinstance(σ, float):
+            σ, j = self.snap_t_to_nearest_tick(σ)  # σ might change due to snapping
|
519 |
+
cond_t = (self.M - 1) - j
|
520 |
+
cond_t = torch.tensor([cond_t] * N, device=self.device)
|
521 |
+
return cond_t, σ
|
522 |
+
else:
|
523 |
+
assert isinstance(σ, torch.Tensor)
|
524 |
+
σ = σ.reshape(-1).cpu().numpy()
|
525 |
+
σs = []
|
526 |
+
js = []
|
527 |
+
for elem in σ:
|
528 |
+
_σ, _j = self.snap_t_to_nearest_tick(elem)
|
529 |
+
σs.append(_σ)
|
530 |
+
js.append((self.M - 1) - _j)
|
531 |
+
|
532 |
+
cond_t = torch.tensor(js, device=self.device)
|
533 |
+
σs = torch.tensor(σs, device=self.device, dtype=torch.float32).reshape(-1, 1, 1, 1)
|
534 |
+
return cond_t, σs
|
535 |
+
|
536 |
+
@staticmethod
|
537 |
+
def cosine_us(M=1000):
|
538 |
+
assert M == 1000
|
539 |
+
|
540 |
+
def α_bar(j):
|
541 |
+
return sin(pi / 2 * j / (M * (0.008 + 1))) ** 2
|
542 |
+
|
543 |
+
us = [0, ]
|
544 |
+
for j in reversed(range(0, M)): # [M-1, 0], inclusive
|
545 |
+
u_j = sqrt(((us[-1] ** 2) + 1) / (max(α_bar(j) / α_bar(j+1), 0.001)) - 1)
|
546 |
+
us.append(u_j)
|
547 |
+
|
548 |
+
us = np.array(us)
|
549 |
+
us = us[1:]
|
550 |
+
us = us[::-1]
|
551 |
+
return us
|
552 |
+
|
553 |
+
@staticmethod
|
554 |
+
def linear_us(M=1000):
|
555 |
+
assert M == 1000
|
556 |
+
β_start = 0.0001
|
557 |
+
β_end = 0.02
|
558 |
+
βs = np.linspace(β_start, β_end, M, dtype=np.float64)
|
559 |
+
αs = np.cumprod(1 - βs)
|
560 |
+
us = np.sqrt((1 - αs) / αs)
|
561 |
+
us = us[::-1]
|
562 |
+
return us
|
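Aside (not part of the commit): `linear_us` above converts the DDPM β schedule into the descending σ grid that `GuidedDDPM.denoise` indexes into, via σ_t = sqrt((1 − ᾱ_t) / ᾱ_t). A minimal sketch to sanity-check that convention, assuming only numpy:

# hedged sketch: recompute the linear-schedule sigma grid standalone
import numpy as np

def linear_us(M=1000):
    betas = np.linspace(0.0001, 0.02, M, dtype=np.float64)
    alphas_bar = np.cumprod(1 - betas)
    us = np.sqrt((1 - alphas_bar) / alphas_bar)  # σ_t = sqrt((1 - ᾱ_t) / ᾱ_t)
    return us[::-1]                              # descending: us[0] = σ_max, us[-1] = σ_min

us = linear_us()
print(us[0], us[-1])  # roughly 157 and 0.01 for the 1000-step linear schedule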
adapt_ncsn.py
ADDED
@@ -0,0 +1,101 @@
from pathlib import Path
import argparse
import yaml

import numpy as np
import torch

from ncsn.ncsnv2 import NCSNv2, NCSNv2Deeper, NCSNv2Deepest, get_sigmas
from ncsn.ema import EMAHelper

from adapt import ScoreAdapter

device = torch.device("cuda")


def get_model(config):
    if config.data.dataset == 'CIFAR10' or config.data.dataset == 'CELEBA':
        return NCSNv2(config).to(config.device)
    elif config.data.dataset == "FFHQ":
        return NCSNv2Deepest(config).to(config.device)
    elif config.data.dataset == 'LSUN':
        return NCSNv2Deeper(config).to(config.device)


def dict2namespace(config):
    namespace = argparse.Namespace()
    for key, value in config.items():
        if isinstance(value, dict):
            new_value = dict2namespace(value)
        else:
            new_value = value
        setattr(namespace, key, new_value)
    return namespace


class NCSN(ScoreAdapter):
    def __init__(self):
        config_fname = Path(__file__).resolve().parent / "ncsn" / "bedroom.yml"
        with config_fname.open("r") as f:
            config = yaml.safe_load(f)
        config = dict2namespace(config)

        config.device = device

        states = torch.load(
            self.checkpoint_root() / "ncsn/exp/logs/bedroom/checkpoint_150000.pth"
        )

        model = get_model(config)
        model = torch.nn.DataParallel(model)
        model.load_state_dict(states[0], strict=True)

        if config.model.ema:
            ema_helper = EMAHelper(mu=config.model.ema_rate)
            ema_helper.register(model)
            ema_helper.load_state_dict(states[-1])
            # HC: update the model param with history ema.
            # if don't do this the colors of images become strangely saturated.
            # this is reported in the paper.
            ema_helper.ema(model)

        model = model.module  # remove DataParallel
        model.eval()
        self.model = model
        self._data_shape = (3, config.data.image_size, config.data.image_size)

        self.σs = model.sigmas.cpu().numpy()
        self._device = device

    def data_shape(self):
        return self._data_shape

    def samps_centered(self):
        return False

    @property
    def σ_max(self):
        return self.σs[0]

    @property
    def σ_min(self):
        return self.σs[-1]

    @torch.no_grad()
    def denoise(self, xs, σ):
        σ, j = self.snap_t_to_nearest_tick(σ)
        N = xs.shape[0]
        cond_t = torch.tensor([j] * N, dtype=torch.long, device=self.device)
        score = self.model(xs, cond_t)
        Ds = xs + score * (σ ** 2)
        return Ds

    def unet_is_cond(self):
        return False

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        j = np.abs(t - self.σs).argmin()
        return self.σs[j], j
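Aside (not part of the commit): `NCSN.denoise` uses the Tweedie identity D(x; σ) = x + σ²·score(x), so the adapter exposes the raw score model through the same denoiser interface as the other adapters. A hedged one-dimensional check, where the score is known in closed form:

# hedged sketch: for data x0 ~ N(mu, s²) corrupted with noise of level σ,
# x + σ²·∇log p(x) equals the posterior mean E[x0 | x]
import numpy as np

mu, s, sigma = 2.0, 1.0, 0.5
x = 3.0                                   # a noisy observation
score = (mu - x) / (s**2 + sigma**2)      # ∇_x log p(x), since x ~ N(mu, s² + σ²)
D = x + sigma**2 * score
posterior_mean = (s**2 * x + sigma**2 * mu) / (s**2 + sigma**2)
print(np.isclose(D, posterior_mean))      # True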
adapt_sd.py
ADDED
@@ -0,0 +1,235 @@
import sys
from pathlib import Path
import torch
import numpy as np
from omegaconf import OmegaConf
from einops import rearrange

from torch import autocast
from contextlib import nullcontext
from math import sqrt
from adapt import ScoreAdapter

import warnings
from transformers import logging
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.set_verbosity_error()


device = torch.device("cuda")


def curr_dir():
    return Path(__file__).resolve().parent


def add_import_path(dirname):
    sys.path.append(str(
        curr_dir() / str(dirname)
    ))


def load_model_from_config(config, ckpt, verbose=False):
    from ldm.util import instantiate_from_config
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.to(device)
    model.eval()
    return model


def load_sd1_model(ckpt_root):
    ckpt_fname = ckpt_root / "stable_diffusion" / "sd-v1-5.ckpt"
    cfg_fname = curr_dir() / "sd1" / "configs" / "v1-inference.yaml"
    H, W = 512, 512

    config = OmegaConf.load(str(cfg_fname))
    model = load_model_from_config(config, str(ckpt_fname))
    return model, H, W


def load_sd2_model(ckpt_root, v2_highres):
    if v2_highres:
        ckpt_fname = ckpt_root / "sd2" / "768-v-ema.ckpt"
        cfg_fname = curr_dir() / "sd2/configs/stable-diffusion/v2-inference-v.yaml"
        H, W = 768, 768
    else:
        ckpt_fname = ckpt_root / "sd2" / "512-base-ema.ckpt"
        cfg_fname = curr_dir() / "sd2/configs/stable-diffusion/v2-inference.yaml"
        H, W = 512, 512

    config = OmegaConf.load(f"{cfg_fname}")
    model = load_model_from_config(config, str(ckpt_fname))
    return model, H, W


def _sqrt(x):
    if isinstance(x, float):
        return sqrt(x)
    else:
        assert isinstance(x, torch.Tensor)
        return torch.sqrt(x)


class StableDiffusion(ScoreAdapter):
    def __init__(self, variant, v2_highres, prompt, scale, precision):
        if variant == "v1":
            add_import_path("sd1")
            self.model, H, W = load_sd1_model(self.checkpoint_root())
        elif variant == "v2":
            add_import_path("sd2")
            self.model, H, W = load_sd2_model(self.checkpoint_root(), v2_highres)
        else:
            raise ValueError(f"{variant}")

        ae_resolution_f = 8

        self._device = self.model._device

        self.prompt = prompt
        self.scale = scale
        self.precision = precision
        self.precision_scope = autocast if self.precision == "autocast" else nullcontext
        self._data_shape = (4, H // ae_resolution_f, W // ae_resolution_f)

        self.cond_func = self.model.get_learned_conditioning
        self.M = 1000
        noise_schedule = "linear"
        self.noise_schedule = noise_schedule
        self.us = self.linear_us(self.M)

    def data_shape(self):
        return self._data_shape

    @property
    def σ_max(self):
        return self.us[0]

    @property
    def σ_min(self):
        return self.us[-1]

    @torch.no_grad()
    def denoise(self, xs, σ, **model_kwargs):
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                N = xs.shape[0]
                c = model_kwargs.pop('c')
                uc = model_kwargs.pop('uc')
                cond_t, σ = self.time_cond_vec(N, σ)
                unscaled_xs = xs
                xs = xs / _sqrt(1 + σ**2)
                if uc is None or self.scale == 1.:
                    output = self.model.apply_model(xs, cond_t, c)
                else:
                    x_in = torch.cat([xs] * 2)
                    t_in = torch.cat([cond_t] * 2)
                    c_in = torch.cat([uc, c])
                    e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
                    output = e_t_uncond + self.scale * (e_t - e_t_uncond)

                if self.model.parameterization == "v":
                    output = self.model.predict_eps_from_z_and_v(xs, cond_t, output)
                else:
                    output = output

                Ds = unscaled_xs - σ * output
                return Ds

    def cond_info(self, batch_size):
        prompts = batch_size * [self.prompt]
        return self.prompts_emb(prompts)

    @torch.no_grad()
    def prompts_emb(self, prompts):
        assert isinstance(prompts, list)
        batch_size = len(prompts)
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                cond = {}
                c = self.cond_func(prompts)
                cond['c'] = c
                uc = None
                if self.scale != 1.0:
                    uc = self.cond_func(batch_size * [""])
                cond['uc'] = uc
                return cond

    def unet_is_cond(self):
        return True

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        j = np.abs(t - self.us).argmin()
        return self.us[j], j

    def time_cond_vec(self, N, σ):
        if isinstance(σ, float):
            σ, j = self.snap_t_to_nearest_tick(σ)  # σ might change due to snapping
            cond_t = (self.M - 1) - j
            cond_t = torch.tensor([cond_t] * N, device=self.device)
            return cond_t, σ
        else:
            assert isinstance(σ, torch.Tensor)
            σ = σ.reshape(-1).cpu().numpy()
            σs = []
            js = []
            for elem in σ:
                _σ, _j = self.snap_t_to_nearest_tick(elem)
                σs.append(_σ)
                js.append((self.M - 1) - _j)

            cond_t = torch.tensor(js, device=self.device)
            σs = torch.tensor(σs, device=self.device, dtype=torch.float32).reshape(-1, 1, 1, 1)
            return cond_t, σs

    @staticmethod
    def linear_us(M=1000):
        assert M == 1000
        β_start = 0.00085
        β_end = 0.0120
        βs = np.linspace(β_start**0.5, β_end**0.5, M, dtype=np.float64)**2
        αs = np.cumprod(1 - βs)
        us = np.sqrt((1 - αs) / αs)
        us = us[::-1]
        return us

    @torch.no_grad()
    def encode(self, xs):
        model = self.model
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                zs = model.get_first_stage_encoding(
                    model.encode_first_stage(xs)
                )
                return zs

    @torch.no_grad()
    def decode(self, xs):
        with self.precision_scope("cuda"):
            with self.model.ema_scope():
                xs = self.model.decode_first_stage(xs)
                return xs


def test():
    sd = StableDiffusion("v2", True, "haha", 10.0, "autocast")
    print(sd)


if __name__ == "__main__":
    test()
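Aside (not part of the commit): the two-branch call in `StableDiffusion.denoise` is standard classifier-free guidance; the unconditional and conditional noise predictions are combined with the guidance scale before the Karras-style denoiser output is formed. A hedged sketch of just that arithmetic on dummy tensors, with `scale` standing in for `self.scale`:

# hedged sketch: classifier-free guidance combination in isolation
import torch

e_t_uncond = torch.randn(1, 4, 64, 64)   # prediction under the empty prompt
e_t = torch.randn(1, 4, 64, 64)          # prediction under the text prompt
scale = 100.0
guided = e_t_uncond + scale * (e_t - e_t_uncond)   # push toward the conditional branch
assert guided.shape == e_t.shape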
adapt_vesde.py
ADDED
@@ -0,0 +1,84 @@
from pathlib import Path
import torch
from ml_collections.config_flags import config_flags

from sde.config import get_config
from sde import ddpm, ncsnv2, ncsnpp  # need to import to trigger its registry
from sde import utils as mutils
from sde.ema import ExponentialMovingAverage

from adapt import ScoreAdapter

device = torch.device("cuda")


def restore_checkpoint(ckpt_dir, state, device):
    loaded_state = torch.load(ckpt_dir, map_location=device)
    # state['optimizer'].load_state_dict(loaded_state['optimizer'])
    state['model'].load_state_dict(loaded_state['model'], strict=False)
    state['ema'].load_state_dict(loaded_state['ema'])
    state['step'] = loaded_state['step']
    return state


def save_checkpoint(ckpt_dir, state):
    saved_state = {
        'optimizer': state['optimizer'].state_dict(),
        'model': state['model'].state_dict(),
        'ema': state['ema'].state_dict(),
        'step': state['step']
    }
    torch.save(saved_state, ckpt_dir)


class VESDE(ScoreAdapter):
    def __init__(self):
        config = get_config()
        config.device = device
        ckpt_fname = self.checkpoint_root() / "sde" / 'checkpoint_127.pth'

        score_model = mutils.create_model(config)
        ema = ExponentialMovingAverage(
            score_model.parameters(), decay=config.model.ema_rate
        )
        state = dict(model=score_model, ema=ema, step=0)
        self._data_shape = (
            config.data.num_channels, config.data.image_size, config.data.image_size
        )

        self._σ_min = float(config.model.sigma_min * 2)

        state = restore_checkpoint(ckpt_fname, state, device=config.device)
        ema.copy_to(score_model.parameters())

        score_model.eval()
        score_model = score_model.module  # remove DataParallel

        self.model = score_model
        self._device = device

    def data_shape(self):
        return self._data_shape

    @property
    def σ_min(self):
        return self._σ_min

    @torch.no_grad()
    def denoise(self, xs, σ):
        N = xs.shape[0]
        # see Karras eqn. 212-215 for the 1/2 σ correction
        cond_t = (0.5 * σ) * torch.ones(N, device=self.device)
        # note that the forward function the model has been modified; see comments
        n_hat = self.model(xs, cond_t)
        Ds = xs + σ * n_hat
        return Ds

    def unet_is_cond(self):
        return False

    def use_cls_guidance(self):
        return False

    def snap_t_to_nearest_tick(self, t):
        return super().snap_t_to_nearest_tick(t)
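Aside (not part of the commit): `save_checkpoint` / `restore_checkpoint` above operate on a plain dict of `model`, `ema`, `step` (plus `optimizer` when saving). A hedged round-trip sketch with a toy module, assuming the repo's `sde.ema.ExponentialMovingAverage` imported above is available:

# hedged sketch: checkpoint round trip with a toy model
import torch, torch.nn as nn
from sde.ema import ExponentialMovingAverage

net = nn.Linear(4, 4)
ema = ExponentialMovingAverage(net.parameters(), decay=0.999)
opt = torch.optim.Adam(net.parameters())

save_checkpoint("toy.pth", dict(optimizer=opt, model=net, ema=ema, step=7))
state = restore_checkpoint("toy.pth", dict(model=net, ema=ema, step=0), device="cpu")
print(state['step'])  # 7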
app.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch
|
3 |
+
|
4 |
+
from my.utils import tqdm
|
5 |
+
from my.utils.seed import seed_everything
|
6 |
+
|
7 |
+
from run_img_sampling import SD, StableDiffusion
|
8 |
+
from misc import torch_samps_to_imgs
|
9 |
+
from pose import PoseConfig
|
10 |
+
|
11 |
+
from run_nerf import VoxConfig
|
12 |
+
from voxnerf.utils import every
|
13 |
+
from voxnerf.vis import stitch_vis, bad_vis as nerf_vis
|
14 |
+
|
15 |
+
from run_sjc import render_one_view
|
16 |
+
|
17 |
+
device_glb = torch.device("cuda")
|
18 |
+
|
19 |
+
@torch.no_grad()
|
20 |
+
def evaluate(score_model, vox, poser):
|
21 |
+
H, W = poser.H, poser.W
|
22 |
+
vox.eval()
|
23 |
+
K, poses = poser.sample_test(100)
|
24 |
+
|
25 |
+
aabb = vox.aabb.T.cpu().numpy()
|
26 |
+
vox = vox.to(device_glb)
|
27 |
+
|
28 |
+
num_imgs = len(poses)
|
29 |
+
|
30 |
+
for i in (pbar := tqdm(range(num_imgs))):
|
31 |
+
|
32 |
+
pose = poses[i]
|
33 |
+
y, depth = render_one_view(vox, aabb, H, W, K, pose)
|
34 |
+
if isinstance(score_model, StableDiffusion):
|
35 |
+
y = score_model.decode(y)
|
36 |
+
pane, img, depth = vis_routine(y, depth)
|
37 |
+
|
38 |
+
# metric.put_artifact(
|
39 |
+
# "view_seq", ".mp4",
|
40 |
+
# lambda fn: stitch_vis(fn, read_stats(metric.output_dir, "view")[1])
|
41 |
+
# )
|
42 |
+
|
43 |
+
def vis_routine(y, depth):
|
44 |
+
pane = nerf_vis(y, depth, final_H=256)
|
45 |
+
im = torch_samps_to_imgs(y)[0]
|
46 |
+
depth = depth.cpu().numpy()
|
47 |
+
return pane, im, depth
|
48 |
+
|
49 |
+
|
50 |
+
if __name__ == "__main__":
|
51 |
+
# cfgs = {'gddpm': {'model': 'm_lsun_256', 'lsun_cat': 'bedroom', 'imgnet_cat': -1}, 'sd': {'variant': 'v1', 'v2_highres': False, 'prompt': 'A high quality photo of a delicious burger', 'scale': 100.0, 'precision': 'autocast'}, 'lr': 0.05, 'n_steps': 10000, 'emptiness_scale': 10, 'emptiness_weight': 10000, 'emptiness_step': 0.5, 'emptiness_multiplier': 20.0, 'depth_weight': 0, 'var_red': True}
|
52 |
+
pose = PoseConfig(rend_hw=64, FoV=60.0, R=1.5)
|
53 |
+
poser = pose.make()
|
54 |
+
sd_model = SD(variant='v1', v2_highres=False, prompt='A high quality photo of a delicious burger', scale=100.0, precision='autocast')
|
55 |
+
model = sd_model.make()
|
56 |
+
vox = VoxConfig(
|
57 |
+
model_type="V_SD", grid_size=100, density_shift=-1.0, c=4,
|
58 |
+
blend_bg_texture=True, bg_texture_hw=4,
|
59 |
+
bbox_len=1.0)
|
60 |
+
vox = vox.make()
|
61 |
+
|
62 |
+
lr = 0.05
|
63 |
+
n_steps = 10000
|
64 |
+
emptiness_scale = 10
|
65 |
+
emptiness_weight = 10000
|
66 |
+
emptiness_step = 0.5
|
67 |
+
emptiness_multiplier = 20.0
|
68 |
+
depth_weight = 0
|
69 |
+
var_red = True
|
70 |
+
|
71 |
+
assert model.samps_centered()
|
72 |
+
_, target_H, target_W = model.data_shape()
|
73 |
+
bs = 1
|
74 |
+
aabb = vox.aabb.T.cpu().numpy()
|
75 |
+
vox = vox.to(device_glb)
|
76 |
+
opt = torch.optim.Adamax(vox.opt_params(), lr=lr)
|
77 |
+
|
78 |
+
H, W = poser.H, poser.W
|
79 |
+
Ks, poses, prompt_prefixes = poser.sample_train(n_steps)
|
80 |
+
|
81 |
+
ts = model.us[30:-10]
|
82 |
+
|
83 |
+
same_noise = torch.randn(1, 4, H, W, device=model.device).repeat(bs, 1, 1, 1)
|
84 |
+
|
85 |
+
with tqdm(total=n_steps) as pbar:
|
86 |
+
for i in range(n_steps):
|
87 |
+
|
88 |
+
p = f"{prompt_prefixes[i]} {model.prompt}"
|
89 |
+
score_conds = model.prompts_emb([p])
|
90 |
+
|
91 |
+
y, depth, ws = render_one_view(vox, aabb, H, W, Ks[i], poses[i], return_w=True)
|
92 |
+
|
93 |
+
if isinstance(model, StableDiffusion):
|
94 |
+
pass
|
95 |
+
else:
|
96 |
+
y = torch.nn.functional.interpolate(y, (target_H, target_W), mode='bilinear')
|
97 |
+
|
98 |
+
opt.zero_grad()
|
99 |
+
|
100 |
+
with torch.no_grad():
|
101 |
+
chosen_σs = np.random.choice(ts, bs, replace=False)
|
102 |
+
chosen_σs = chosen_σs.reshape(-1, 1, 1, 1)
|
103 |
+
chosen_σs = torch.as_tensor(chosen_σs, device=model.device, dtype=torch.float32)
|
104 |
+
# chosen_σs = us[i]
|
105 |
+
|
106 |
+
noise = torch.randn(bs, *y.shape[1:], device=model.device)
|
107 |
+
|
108 |
+
zs = y + chosen_σs * noise
|
109 |
+
Ds = model.denoise(zs, chosen_σs, **score_conds)
|
110 |
+
|
111 |
+
if var_red:
|
112 |
+
grad = (Ds - y) / chosen_σs
|
113 |
+
else:
|
114 |
+
grad = (Ds - zs) / chosen_σs
|
115 |
+
|
116 |
+
grad = grad.mean(0, keepdim=True)
|
117 |
+
|
118 |
+
y.backward(-grad, retain_graph=True)
|
119 |
+
|
120 |
+
if depth_weight > 0:
|
121 |
+
center_depth = depth[7:-7, 7:-7]
|
122 |
+
border_depth_mean = (depth.sum() - center_depth.sum()) / (64*64-50*50)
|
123 |
+
center_depth_mean = center_depth.mean()
|
124 |
+
depth_diff = center_depth_mean - border_depth_mean
|
125 |
+
depth_loss = - torch.log(depth_diff + 1e-12)
|
126 |
+
depth_loss = depth_weight * depth_loss
|
127 |
+
depth_loss.backward(retain_graph=True)
|
128 |
+
|
129 |
+
emptiness_loss = torch.log(1 + emptiness_scale * ws).mean()
|
130 |
+
emptiness_loss = emptiness_weight * emptiness_loss
|
131 |
+
if emptiness_step * n_steps <= i:
|
132 |
+
emptiness_loss *= emptiness_multiplier
|
133 |
+
emptiness_loss.backward()
|
134 |
+
|
135 |
+
opt.step()
|
136 |
+
|
137 |
+
|
138 |
+
# metric.put_scalars(**tsr_stats(y))
|
139 |
+
|
140 |
+
if every(pbar, percent=1):
|
141 |
+
with torch.no_grad():
|
142 |
+
if isinstance(model, StableDiffusion):
|
143 |
+
y = model.decode(y)
|
144 |
+
pane, img, depth = vis_routine(y, depth)
|
145 |
+
|
146 |
+
# TODO: Output pane, img and depth to Gradio
|
147 |
+
|
148 |
+
pbar.update()
|
149 |
+
pbar.set_description(p)
|
150 |
+
|
151 |
+
# TODO: Save Checkpoint
|
152 |
+
ckpt = vox.state_dict()
|
153 |
+
# evaluate(model, vox, poser)
|
154 |
+
|
155 |
+
# TODO: Add code to stitch together the images and save them to a video
|
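Aside (not part of the commit): the core of the training loop above is a score-distillation style update. The rendered latent `y` is perturbed to `zs`, the frozen denoiser produces `Ds`, and the (negated) score `(Ds - y) / σ` is fed back through `y.backward(...)` into the voxel parameters. A hedged sketch of that step in isolation, with a dummy stand-in for `model.denoise`:

# hedged sketch: the gradient injection used in the loop above, on a toy tensor
import torch

y = torch.zeros(1, 4, 64, 64, requires_grad=True)    # stands in for the rendered latent
sigma = 1.5
with torch.no_grad():
    noise = torch.randn_like(y)
    zs = y + sigma * noise
    Ds = zs - sigma * noise * 0.9                     # stand-in for model.denoise(zs, sigma)
    grad = (Ds - y) / sigma                           # var_red=True branch
y.backward(-grad)                                     # populates y.grad with the negated score
print(y.grad.shape)                                   # same shape as y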
env.json
ADDED
@@ -0,0 +1,3 @@
{
    "data_root": "release"
}
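Aside (not part of the commit): `env.json` only pins where the released checkpoints live. A hedged sketch of how such a config would typically be consumed (the exact reader lives elsewhere in the repo, e.g. behind `checkpoint_root()`, and is an assumption here):

# hedged sketch: reading env.json to locate the checkpoint directory
import json
from pathlib import Path

with open("env.json") as f:
    data_root = Path(json.load(f)["data_root"])   # -> Path("release")
print(data_root / "diffusion_ckpts")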
guided_diffusion/README.md
ADDED
@@ -0,0 +1,5 @@
Selected modules from OpenAI's [guided diffusion](https://github.com/openai/guided-diffusion), retrieved at commit `22e0df8183507e13a7813f8d38d51b072ca1e67c`.

It's a bare-minimum set of files needed to run their pretrained models. You can download the model checkpoints by following the instructions in their repository README.

Some modifications are made to remove the distributed processing utilities in order to reduce code complexity.
guided_diffusion/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""
Codebase for "Improved Denoising Diffusion Probabilistic Models".
"""
guided_diffusion/fp16_util.py
ADDED
@@ -0,0 +1,237 @@
"""
Helpers to train with 16-bit precision.
"""

import numpy as np
import torch as th
import torch.nn as nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

# from . import logger

INITIAL_LOG_LOSS_SCALE = 20.0


def convert_module_to_f16(l):
    """
    Convert primitive modules to float16.
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.half()
        if l.bias is not None:
            l.bias.data = l.bias.data.half()


def convert_module_to_f32(l):
    """
    Convert primitive modules to float32, undoing convert_module_to_f16().
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.float()
        if l.bias is not None:
            l.bias.data = l.bias.data.float()


def make_master_params(param_groups_and_shapes):
    """
    Copy model parameters into a (differently-shaped) list of full-precision
    parameters.
    """
    master_params = []
    for param_group, shape in param_groups_and_shapes:
        master_param = nn.Parameter(
            _flatten_dense_tensors(
                [param.detach().float() for (_, param) in param_group]
            ).view(shape)
        )
        master_param.requires_grad = True
        master_params.append(master_param)
    return master_params


def model_grads_to_master_grads(param_groups_and_shapes, master_params):
    """
    Copy the gradients from the model parameters into the master parameters
    from make_master_params().
    """
    for master_param, (param_group, shape) in zip(
        master_params, param_groups_and_shapes
    ):
        master_param.grad = _flatten_dense_tensors(
            [param_grad_or_zeros(param) for (_, param) in param_group]
        ).view(shape)


def master_params_to_model_params(param_groups_and_shapes, master_params):
    """
    Copy the master parameter data back into the model parameters.
    """
    # Without copying to a list, if a generator is passed, this will
    # silently not copy any parameters.
    for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
        for (_, param), unflat_master_param in zip(
            param_group, unflatten_master_params(param_group, master_param.view(-1))
        ):
            param.detach().copy_(unflat_master_param)


def unflatten_master_params(param_group, master_param):
    return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])


def get_param_groups_and_shapes(named_model_params):
    named_model_params = list(named_model_params)
    scalar_vector_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
        (-1),
    )
    matrix_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim > 1],
        (1, -1),
    )
    return [scalar_vector_named_params, matrix_named_params]


def master_params_to_state_dict(
    model, param_groups_and_shapes, master_params, use_fp16
):
    if use_fp16:
        state_dict = model.state_dict()
        for master_param, (param_group, _) in zip(
            master_params, param_groups_and_shapes
        ):
            for (name, _), unflat_master_param in zip(
                param_group, unflatten_master_params(param_group, master_param.view(-1))
            ):
                assert name in state_dict
                state_dict[name] = unflat_master_param
    else:
        state_dict = model.state_dict()
        for i, (name, _value) in enumerate(model.named_parameters()):
            assert name in state_dict
            state_dict[name] = master_params[i]
    return state_dict


def state_dict_to_master_params(model, state_dict, use_fp16):
    if use_fp16:
        named_model_params = [
            (name, state_dict[name]) for name, _ in model.named_parameters()
        ]
        param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
        master_params = make_master_params(param_groups_and_shapes)
    else:
        master_params = [state_dict[name] for name, _ in model.named_parameters()]
    return master_params


def zero_master_grads(master_params):
    for param in master_params:
        param.grad = None


def zero_grad(model_params):
    for param in model_params:
        # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
        if param.grad is not None:
            param.grad.detach_()
            param.grad.zero_()


def param_grad_or_zeros(param):
    if param.grad is not None:
        return param.grad.data.detach()
    else:
        return th.zeros_like(param)


class MixedPrecisionTrainer:
    def __init__(
        self,
        *,
        model,
        use_fp16=False,
        fp16_scale_growth=1e-3,
        initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
    ):
        self.model = model
        self.use_fp16 = use_fp16
        self.fp16_scale_growth = fp16_scale_growth

        self.model_params = list(self.model.parameters())
        self.master_params = self.model_params
        self.param_groups_and_shapes = None
        self.lg_loss_scale = initial_lg_loss_scale

        if self.use_fp16:
            self.param_groups_and_shapes = get_param_groups_and_shapes(
                self.model.named_parameters()
            )
            self.master_params = make_master_params(self.param_groups_and_shapes)
            self.model.convert_to_fp16()

    def zero_grad(self):
        zero_grad(self.model_params)

    def backward(self, loss: th.Tensor):
        if self.use_fp16:
            loss_scale = 2 ** self.lg_loss_scale
            (loss * loss_scale).backward()
        else:
            loss.backward()

    def optimize(self, opt: th.optim.Optimizer):
        if self.use_fp16:
            return self._optimize_fp16(opt)
        else:
            return self._optimize_normal(opt)

    def _optimize_fp16(self, opt: th.optim.Optimizer):
        logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
        model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
        grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
        if check_overflow(grad_norm):
            self.lg_loss_scale -= 1
            logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
            zero_master_grads(self.master_params)
            return False

        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)

        for p in self.master_params:
            p.grad.mul_(1.0 / (2 ** self.lg_loss_scale))
        opt.step()
        zero_master_grads(self.master_params)
        master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
        self.lg_loss_scale += self.fp16_scale_growth
        return True

    def _optimize_normal(self, opt: th.optim.Optimizer):
        grad_norm, param_norm = self._compute_norms()
        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)
        opt.step()
        return True

    def _compute_norms(self, grad_scale=1.0):
        grad_norm = 0.0
        param_norm = 0.0
        for p in self.master_params:
            with th.no_grad():
                param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
                if p.grad is not None:
                    grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
        return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)

    def master_params_to_state_dict(self, master_params):
        return master_params_to_state_dict(
            self.model, self.param_groups_and_shapes, master_params, self.use_fp16
        )

    def state_dict_to_master_params(self, state_dict):
        return state_dict_to_master_params(self.model, state_dict, self.use_fp16)


def check_overflow(value):
    return (value == float("inf")) or (value == -float("inf")) or (value != value)
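Aside (not part of the commit): `MixedPrecisionTrainer` keeps fp32 "master" copies of the fp16 model parameters and applies dynamic loss scaling. Note that in this copy the `from . import logger` import is commented out, so both `_optimize_fp16` and `_optimize_normal` would raise a NameError on their `logger.logkv_mean` calls; the hedged sketch below therefore stops short of `optimize()`:

# hedged sketch: intended call pattern, fp32 mode, toy model
import torch, torch.nn as nn

model = nn.Linear(8, 8)
trainer = MixedPrecisionTrainer(model=model, use_fp16=False)
opt = torch.optim.AdamW(trainer.master_params, lr=1e-4)

x = torch.randn(4, 8)
loss = (model(x) ** 2).mean()
trainer.zero_grad()
trainer.backward(loss)   # scales the loss by 2**lg_loss_scale when use_fp16=True
opt.step()               # trainer.optimize(opt) would hit the missing `logger`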
guided_diffusion/gaussian_diffusion.py
ADDED
@@ -0,0 +1,908 @@
"""
This code started out as a PyTorch port of Ho et al's diffusion models:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py

Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
"""

import enum
import math

import numpy as np
import torch as th

from .nn import mean_flat
from .losses import normal_kl, discretized_gaussian_log_likelihood


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)


class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon


class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.

    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        return self == LossType.KL or self == LossType.RESCALED_KL


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.

    Ported directly from here, and then adapted over time to further experimentation.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42

    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    :param model_mean_type: a ModelMeanType determining what the model outputs.
    :param model_var_type: a ModelVarType determining how variance is output.
    :param loss_type: a LossType determining the loss function to use.
    :param rescale_timesteps: if True, pass floating point timesteps into the
                              model so that they are always scaled like in the
                              original paper (0 to 1000).
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        rescale_timesteps=False,
    ):
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.rescale_timesteps = rescale_timesteps

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # log calculation clipped because the posterior variance is 0 at the
        # beginning of the diffusion chain.
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        )
        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev)
            * np.sqrt(alphas)
            / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).

        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        mean = (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        )
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, x_start.shape
        )
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.

        In other words, sample from q(x_t | x_0).

        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
            * noise
        )

    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)

        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(
        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
    ):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.

        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, self._scale_timesteps(t), **model_kwargs)

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            if self.model_var_type == ModelVarType.LEARNED:
                model_log_variance = model_var_values
                model_variance = th.exp(model_log_variance)
            else:
                min_log = _extract_into_tensor(
                    self.posterior_log_variance_clipped, t, x.shape
                )
                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
                # The model_var_values is [-1, 1] for [min_var, max_var].
                frac = (model_var_values + 1) / 2
                model_log_variance = frac * max_log + (1 - frac) * min_log
                model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            pred_xstart = process_xstart(
                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
            )
            model_mean = model_output
        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
            if self.model_mean_type == ModelMeanType.START_X:
                pred_xstart = process_xstart(model_output)
            else:
                pred_xstart = process_xstart(
                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
                )
            model_mean, _, _ = self.q_posterior_mean_variance(
                x_start=pred_xstart, x_t=x, t=t
            )
        else:
            raise NotImplementedError(self.model_mean_type)

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        )
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )

    def _predict_xstart_from_xprev(self, x_t, t, xprev):
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
            - _extract_into_tensor(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
            )
            * x_t
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - pred_xstart
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

    def _scale_timesteps(self, t):
        if self.rescale_timesteps:
            return t.float() * (1000.0 / self.num_timesteps)
        return t

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.

        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
        new_mean = (
            p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
        )
        return new_mean

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.

        See condition_mean() for details on cond_fn.

        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        """
        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
            x, self._scale_timesteps(t), **model_kwargs
        )

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(
            x_start=out["pred_xstart"], x_t=x, t=t
        )
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.

        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(
                cond_fn, out, x, t, model_kwargs=model_kwargs
            )
        sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    def p_sample_loop(
        self,
        model,
        shape,
        noise=None,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
        device=None,
        progress=False,
    ):
        """
        Generate samples from the model.

        :param model: the model module.
        :param shape: the shape of the samples, (N, C, H, W).
        :param noise: if specified, the noise from the encoder to sample.
                      Should be of the same shape as `shape`.
        :param clip_denoised: if True, clip x_start predictions to [-1, 1].
|
461 |
+
:param denoised_fn: if not None, a function which applies to the
|
462 |
+
x_start prediction before it is used to sample.
|
463 |
+
:param cond_fn: if not None, this is a gradient function that acts
|
464 |
+
similarly to the model.
|
465 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
466 |
+
pass to the model. This can be used for conditioning.
|
467 |
+
:param device: if specified, the device to create the samples on.
|
468 |
+
If not specified, use a model parameter's device.
|
469 |
+
:param progress: if True, show a tqdm progress bar.
|
470 |
+
:return: a non-differentiable batch of samples.
|
471 |
+
"""
|
472 |
+
final = None
|
473 |
+
for sample in self.p_sample_loop_progressive(
|
474 |
+
model,
|
475 |
+
shape,
|
476 |
+
noise=noise,
|
477 |
+
clip_denoised=clip_denoised,
|
478 |
+
denoised_fn=denoised_fn,
|
479 |
+
cond_fn=cond_fn,
|
480 |
+
model_kwargs=model_kwargs,
|
481 |
+
device=device,
|
482 |
+
progress=progress,
|
483 |
+
):
|
484 |
+
final = sample
|
485 |
+
return final["sample"]
|
486 |
+
|
487 |
+
def p_sample_loop_progressive(
|
488 |
+
self,
|
489 |
+
model,
|
490 |
+
shape,
|
491 |
+
noise=None,
|
492 |
+
clip_denoised=True,
|
493 |
+
denoised_fn=None,
|
494 |
+
cond_fn=None,
|
495 |
+
model_kwargs=None,
|
496 |
+
device=None,
|
497 |
+
progress=False,
|
498 |
+
):
|
499 |
+
"""
|
500 |
+
Generate samples from the model and yield intermediate samples from
|
501 |
+
each timestep of diffusion.
|
502 |
+
|
503 |
+
Arguments are the same as p_sample_loop().
|
504 |
+
Returns a generator over dicts, where each dict is the return value of
|
505 |
+
p_sample().
|
506 |
+
"""
|
507 |
+
if device is None:
|
508 |
+
device = next(model.parameters()).device
|
509 |
+
assert isinstance(shape, (tuple, list))
|
510 |
+
if noise is not None:
|
511 |
+
img = noise
|
512 |
+
else:
|
513 |
+
img = th.randn(*shape, device=device)
|
514 |
+
indices = list(range(self.num_timesteps))[::-1]
|
515 |
+
|
516 |
+
if progress:
|
517 |
+
# Lazy import so that we don't depend on tqdm.
|
518 |
+
from tqdm.auto import tqdm
|
519 |
+
|
520 |
+
indices = tqdm(indices)
|
521 |
+
|
522 |
+
for i in indices:
|
523 |
+
t = th.tensor([i] * shape[0], device=device)
|
524 |
+
with th.no_grad():
|
525 |
+
out = self.p_sample(
|
526 |
+
model,
|
527 |
+
img,
|
528 |
+
t,
|
529 |
+
clip_denoised=clip_denoised,
|
530 |
+
denoised_fn=denoised_fn,
|
531 |
+
cond_fn=cond_fn,
|
532 |
+
model_kwargs=model_kwargs,
|
533 |
+
)
|
534 |
+
yield out
|
535 |
+
img = out["sample"]
|
536 |
+
|
537 |
+
def ddim_sample(
|
538 |
+
self,
|
539 |
+
model,
|
540 |
+
x,
|
541 |
+
t,
|
542 |
+
clip_denoised=True,
|
543 |
+
denoised_fn=None,
|
544 |
+
cond_fn=None,
|
545 |
+
model_kwargs=None,
|
546 |
+
eta=0.0,
|
547 |
+
):
|
548 |
+
"""
|
549 |
+
Sample x_{t-1} from the model using DDIM.
|
550 |
+
|
551 |
+
Same usage as p_sample().
|
552 |
+
"""
|
553 |
+
out = self.p_mean_variance(
|
554 |
+
model,
|
555 |
+
x,
|
556 |
+
t,
|
557 |
+
clip_denoised=clip_denoised,
|
558 |
+
denoised_fn=denoised_fn,
|
559 |
+
model_kwargs=model_kwargs,
|
560 |
+
)
|
561 |
+
if cond_fn is not None:
|
562 |
+
out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
|
563 |
+
|
564 |
+
# Usually our model outputs epsilon, but we re-derive it
|
565 |
+
# in case we used x_start or x_prev prediction.
|
566 |
+
eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
|
567 |
+
|
568 |
+
alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
|
569 |
+
alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
|
570 |
+
sigma = (
|
571 |
+
eta
|
572 |
+
* th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
|
573 |
+
* th.sqrt(1 - alpha_bar / alpha_bar_prev)
|
574 |
+
)
|
575 |
+
# Equation 12.
|
576 |
+
noise = th.randn_like(x)
|
577 |
+
mean_pred = (
|
578 |
+
out["pred_xstart"] * th.sqrt(alpha_bar_prev)
|
579 |
+
+ th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
|
580 |
+
)
|
581 |
+
nonzero_mask = (
|
582 |
+
(t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
|
583 |
+
) # no noise when t == 0
|
584 |
+
sample = mean_pred + nonzero_mask * sigma * noise
|
585 |
+
return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
586 |
+
|
587 |
+
def ddim_reverse_sample(
|
588 |
+
self,
|
589 |
+
model,
|
590 |
+
x,
|
591 |
+
t,
|
592 |
+
clip_denoised=True,
|
593 |
+
denoised_fn=None,
|
594 |
+
model_kwargs=None,
|
595 |
+
eta=0.0,
|
596 |
+
):
|
597 |
+
"""
|
598 |
+
Sample x_{t+1} from the model using DDIM reverse ODE.
|
599 |
+
"""
|
600 |
+
assert eta == 0.0, "Reverse ODE only for deterministic path"
|
601 |
+
out = self.p_mean_variance(
|
602 |
+
model,
|
603 |
+
x,
|
604 |
+
t,
|
605 |
+
clip_denoised=clip_denoised,
|
606 |
+
denoised_fn=denoised_fn,
|
607 |
+
model_kwargs=model_kwargs,
|
608 |
+
)
|
609 |
+
# Usually our model outputs epsilon, but we re-derive it
|
610 |
+
# in case we used x_start or x_prev prediction.
|
611 |
+
eps = (
|
612 |
+
_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
|
613 |
+
- out["pred_xstart"]
|
614 |
+
) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
|
615 |
+
alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
|
616 |
+
|
617 |
+
# Equation 12. reversed
|
618 |
+
mean_pred = (
|
619 |
+
out["pred_xstart"] * th.sqrt(alpha_bar_next)
|
620 |
+
+ th.sqrt(1 - alpha_bar_next) * eps
|
621 |
+
)
|
622 |
+
|
623 |
+
return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
|
624 |
+
|
625 |
+
def ddim_sample_loop(
|
626 |
+
self,
|
627 |
+
model,
|
628 |
+
shape,
|
629 |
+
noise=None,
|
630 |
+
clip_denoised=True,
|
631 |
+
denoised_fn=None,
|
632 |
+
cond_fn=None,
|
633 |
+
model_kwargs=None,
|
634 |
+
device=None,
|
635 |
+
progress=False,
|
636 |
+
eta=0.0,
|
637 |
+
):
|
638 |
+
"""
|
639 |
+
Generate samples from the model using DDIM.
|
640 |
+
|
641 |
+
Same usage as p_sample_loop().
|
642 |
+
"""
|
643 |
+
final = None
|
644 |
+
for sample in self.ddim_sample_loop_progressive(
|
645 |
+
model,
|
646 |
+
shape,
|
647 |
+
noise=noise,
|
648 |
+
clip_denoised=clip_denoised,
|
649 |
+
denoised_fn=denoised_fn,
|
650 |
+
cond_fn=cond_fn,
|
651 |
+
model_kwargs=model_kwargs,
|
652 |
+
device=device,
|
653 |
+
progress=progress,
|
654 |
+
eta=eta,
|
655 |
+
):
|
656 |
+
final = sample
|
657 |
+
return final["sample"]
|
658 |
+
|
659 |
+
def ddim_sample_loop_progressive(
|
660 |
+
self,
|
661 |
+
model,
|
662 |
+
shape,
|
663 |
+
noise=None,
|
664 |
+
clip_denoised=True,
|
665 |
+
denoised_fn=None,
|
666 |
+
cond_fn=None,
|
667 |
+
model_kwargs=None,
|
668 |
+
device=None,
|
669 |
+
progress=False,
|
670 |
+
eta=0.0,
|
671 |
+
):
|
672 |
+
"""
|
673 |
+
Use DDIM to sample from the model and yield intermediate samples from
|
674 |
+
each timestep of DDIM.
|
675 |
+
|
676 |
+
Same usage as p_sample_loop_progressive().
|
677 |
+
"""
|
678 |
+
if device is None:
|
679 |
+
device = next(model.parameters()).device
|
680 |
+
assert isinstance(shape, (tuple, list))
|
681 |
+
if noise is not None:
|
682 |
+
img = noise
|
683 |
+
else:
|
684 |
+
img = th.randn(*shape, device=device)
|
685 |
+
indices = list(range(self.num_timesteps))[::-1]
|
686 |
+
|
687 |
+
if progress:
|
688 |
+
# Lazy import so that we don't depend on tqdm.
|
689 |
+
from tqdm.auto import tqdm
|
690 |
+
|
691 |
+
indices = tqdm(indices)
|
692 |
+
|
693 |
+
for i in indices:
|
694 |
+
t = th.tensor([i] * shape[0], device=device)
|
695 |
+
with th.no_grad():
|
696 |
+
out = self.ddim_sample(
|
697 |
+
model,
|
698 |
+
img,
|
699 |
+
t,
|
700 |
+
clip_denoised=clip_denoised,
|
701 |
+
denoised_fn=denoised_fn,
|
702 |
+
cond_fn=cond_fn,
|
703 |
+
model_kwargs=model_kwargs,
|
704 |
+
eta=eta,
|
705 |
+
)
|
706 |
+
yield out
|
707 |
+
img = out["sample"]
|
708 |
+
|
709 |
+
def _vb_terms_bpd(
|
710 |
+
self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
|
711 |
+
):
|
712 |
+
"""
|
713 |
+
Get a term for the variational lower-bound.
|
714 |
+
|
715 |
+
The resulting units are bits (rather than nats, as one might expect).
|
716 |
+
This allows for comparison to other papers.
|
717 |
+
|
718 |
+
:return: a dict with the following keys:
|
719 |
+
- 'output': a shape [N] tensor of NLLs or KLs.
|
720 |
+
- 'pred_xstart': the x_0 predictions.
|
721 |
+
"""
|
722 |
+
true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
|
723 |
+
x_start=x_start, x_t=x_t, t=t
|
724 |
+
)
|
725 |
+
out = self.p_mean_variance(
|
726 |
+
model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
|
727 |
+
)
|
728 |
+
kl = normal_kl(
|
729 |
+
true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
|
730 |
+
)
|
731 |
+
kl = mean_flat(kl) / np.log(2.0)
|
732 |
+
|
733 |
+
decoder_nll = -discretized_gaussian_log_likelihood(
|
734 |
+
x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
|
735 |
+
)
|
736 |
+
assert decoder_nll.shape == x_start.shape
|
737 |
+
decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
|
738 |
+
|
739 |
+
# At the first timestep return the decoder NLL,
|
740 |
+
# otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
|
741 |
+
output = th.where((t == 0), decoder_nll, kl)
|
742 |
+
return {"output": output, "pred_xstart": out["pred_xstart"]}
|
743 |
+
|
744 |
+
def training_losses(self, model, x_start, t, model_kwargs=None, noise=None):
|
745 |
+
"""
|
746 |
+
Compute training losses for a single timestep.
|
747 |
+
|
748 |
+
:param model: the model to evaluate loss on.
|
749 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
750 |
+
:param t: a batch of timestep indices.
|
751 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
752 |
+
pass to the model. This can be used for conditioning.
|
753 |
+
:param noise: if specified, the specific Gaussian noise to try to remove.
|
754 |
+
:return: a dict with the key "loss" containing a tensor of shape [N].
|
755 |
+
Some mean or variance settings may also have other keys.
|
756 |
+
"""
|
757 |
+
if model_kwargs is None:
|
758 |
+
model_kwargs = {}
|
759 |
+
if noise is None:
|
760 |
+
noise = th.randn_like(x_start)
|
761 |
+
x_t = self.q_sample(x_start, t, noise=noise)
|
762 |
+
|
763 |
+
terms = {}
|
764 |
+
|
765 |
+
if self.loss_type == LossType.KL or self.loss_type == LossType.RESCALED_KL:
|
766 |
+
terms["loss"] = self._vb_terms_bpd(
|
767 |
+
model=model,
|
768 |
+
x_start=x_start,
|
769 |
+
x_t=x_t,
|
770 |
+
t=t,
|
771 |
+
clip_denoised=False,
|
772 |
+
model_kwargs=model_kwargs,
|
773 |
+
)["output"]
|
774 |
+
if self.loss_type == LossType.RESCALED_KL:
|
775 |
+
terms["loss"] *= self.num_timesteps
|
776 |
+
elif self.loss_type == LossType.MSE or self.loss_type == LossType.RESCALED_MSE:
|
777 |
+
model_output = model(x_t, self._scale_timesteps(t), **model_kwargs)
|
778 |
+
|
779 |
+
if self.model_var_type in [
|
780 |
+
ModelVarType.LEARNED,
|
781 |
+
ModelVarType.LEARNED_RANGE,
|
782 |
+
]:
|
783 |
+
B, C = x_t.shape[:2]
|
784 |
+
assert model_output.shape == (B, C * 2, *x_t.shape[2:])
|
785 |
+
model_output, model_var_values = th.split(model_output, C, dim=1)
|
786 |
+
# Learn the variance using the variational bound, but don't let
|
787 |
+
# it affect our mean prediction.
|
788 |
+
frozen_out = th.cat([model_output.detach(), model_var_values], dim=1)
|
789 |
+
terms["vb"] = self._vb_terms_bpd(
|
790 |
+
model=lambda *args, r=frozen_out: r,
|
791 |
+
x_start=x_start,
|
792 |
+
x_t=x_t,
|
793 |
+
t=t,
|
794 |
+
clip_denoised=False,
|
795 |
+
)["output"]
|
796 |
+
if self.loss_type == LossType.RESCALED_MSE:
|
797 |
+
# Divide by 1000 for equivalence with initial implementation.
|
798 |
+
# Without a factor of 1/1000, the VB term hurts the MSE term.
|
799 |
+
terms["vb"] *= self.num_timesteps / 1000.0
|
800 |
+
|
801 |
+
target = {
|
802 |
+
ModelMeanType.PREVIOUS_X: self.q_posterior_mean_variance(
|
803 |
+
x_start=x_start, x_t=x_t, t=t
|
804 |
+
)[0],
|
805 |
+
ModelMeanType.START_X: x_start,
|
806 |
+
ModelMeanType.EPSILON: noise,
|
807 |
+
}[self.model_mean_type]
|
808 |
+
assert model_output.shape == target.shape == x_start.shape
|
809 |
+
terms["mse"] = mean_flat((target - model_output) ** 2)
|
810 |
+
if "vb" in terms:
|
811 |
+
terms["loss"] = terms["mse"] + terms["vb"]
|
812 |
+
else:
|
813 |
+
terms["loss"] = terms["mse"]
|
814 |
+
else:
|
815 |
+
raise NotImplementedError(self.loss_type)
|
816 |
+
|
817 |
+
return terms
|
818 |
+
|
819 |
+
def _prior_bpd(self, x_start):
|
820 |
+
"""
|
821 |
+
Get the prior KL term for the variational lower-bound, measured in
|
822 |
+
bits-per-dim.
|
823 |
+
|
824 |
+
This term can't be optimized, as it only depends on the encoder.
|
825 |
+
|
826 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
827 |
+
:return: a batch of [N] KL values (in bits), one per batch element.
|
828 |
+
"""
|
829 |
+
batch_size = x_start.shape[0]
|
830 |
+
t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
|
831 |
+
qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
|
832 |
+
kl_prior = normal_kl(
|
833 |
+
mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
|
834 |
+
)
|
835 |
+
return mean_flat(kl_prior) / np.log(2.0)
|
836 |
+
|
837 |
+
def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
|
838 |
+
"""
|
839 |
+
Compute the entire variational lower-bound, measured in bits-per-dim,
|
840 |
+
as well as other related quantities.
|
841 |
+
|
842 |
+
:param model: the model to evaluate loss on.
|
843 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
844 |
+
:param clip_denoised: if True, clip denoised samples.
|
845 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
846 |
+
pass to the model. This can be used for conditioning.
|
847 |
+
|
848 |
+
:return: a dict containing the following keys:
|
849 |
+
- total_bpd: the total variational lower-bound, per batch element.
|
850 |
+
- prior_bpd: the prior term in the lower-bound.
|
851 |
+
- vb: an [N x T] tensor of terms in the lower-bound.
|
852 |
+
- xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
|
853 |
+
- mse: an [N x T] tensor of epsilon MSEs for each timestep.
|
854 |
+
"""
|
855 |
+
device = x_start.device
|
856 |
+
batch_size = x_start.shape[0]
|
857 |
+
|
858 |
+
vb = []
|
859 |
+
xstart_mse = []
|
860 |
+
mse = []
|
861 |
+
for t in list(range(self.num_timesteps))[::-1]:
|
862 |
+
t_batch = th.tensor([t] * batch_size, device=device)
|
863 |
+
noise = th.randn_like(x_start)
|
864 |
+
x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
|
865 |
+
# Calculate VLB term at the current timestep
|
866 |
+
with th.no_grad():
|
867 |
+
out = self._vb_terms_bpd(
|
868 |
+
model,
|
869 |
+
x_start=x_start,
|
870 |
+
x_t=x_t,
|
871 |
+
t=t_batch,
|
872 |
+
clip_denoised=clip_denoised,
|
873 |
+
model_kwargs=model_kwargs,
|
874 |
+
)
|
875 |
+
vb.append(out["output"])
|
876 |
+
xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
|
877 |
+
eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
|
878 |
+
mse.append(mean_flat((eps - noise) ** 2))
|
879 |
+
|
880 |
+
vb = th.stack(vb, dim=1)
|
881 |
+
xstart_mse = th.stack(xstart_mse, dim=1)
|
882 |
+
mse = th.stack(mse, dim=1)
|
883 |
+
|
884 |
+
prior_bpd = self._prior_bpd(x_start)
|
885 |
+
total_bpd = vb.sum(dim=1) + prior_bpd
|
886 |
+
return {
|
887 |
+
"total_bpd": total_bpd,
|
888 |
+
"prior_bpd": prior_bpd,
|
889 |
+
"vb": vb,
|
890 |
+
"xstart_mse": xstart_mse,
|
891 |
+
"mse": mse,
|
892 |
+
}
|
893 |
+
|
894 |
+
|
895 |
+
def _extract_into_tensor(arr, timesteps, broadcast_shape):
|
896 |
+
"""
|
897 |
+
Extract values from a 1-D numpy array for a batch of indices.
|
898 |
+
|
899 |
+
:param arr: the 1-D numpy array.
|
900 |
+
:param timesteps: a tensor of indices into the array to extract.
|
901 |
+
:param broadcast_shape: a larger shape of K dimensions with the batch
|
902 |
+
dimension equal to the length of timesteps.
|
903 |
+
:return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
|
904 |
+
"""
|
905 |
+
res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
|
906 |
+
while len(res.shape) < len(broadcast_shape):
|
907 |
+
res = res[..., None]
|
908 |
+
return res.expand(broadcast_shape)
|
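A minimal usage sketch for the sampling loops above (illustrative only, not part of the committed file; it assumes `diffusion` is a GaussianDiffusion/SpacedDiffusion instance and `model` is an epsilon-prediction network with the (x, t, **kwargs) call signature used above):

import torch as th

model.eval()
with th.no_grad():
    # Ancestral sampling over every retained timestep:
    samples = diffusion.p_sample_loop(model, (4, 3, 64, 64), progress=True)
    # Deterministic DDIM sampling; eta=0.0 drops the noise term entirely:
    ddim_samples = diffusion.ddim_sample_loop(model, (4, 3, 64, 64), eta=0.0)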
guided_diffusion/losses.py
ADDED
@@ -0,0 +1,77 @@
"""
Helpers for various likelihood-based losses. These are ported from the original
Ho et al. diffusion models codebase:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
"""

import numpy as np

import torch as th


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.

    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    tensor = None
    for obj in (mean1, logvar1, mean2, logvar2):
        if isinstance(obj, th.Tensor):
            tensor = obj
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = [
        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
        for x in (logvar1, logvar2)
    ]

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    A fast approximation of the cumulative distribution function of the
    standard normal.
    """
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
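A quick sanity-check sketch of the helpers above (illustrative only, not part of the committed file; the tensor shapes are arbitrary). normal_kl broadcasts scalars against tensors and returns per-element KL in nats, and discretized_gaussian_log_likelihood expects images already rescaled to [-1, 1]:

import torch as th
from guided_diffusion.losses import normal_kl, discretized_gaussian_log_likelihood

kl = normal_kl(th.zeros(4), th.zeros(4), 1.0, 1.0)  # per-element KL(N(0,1) || N(1,e))
x = th.rand(2, 3, 8, 8) * 2 - 1                     # dummy images in [-1, 1]
ll = discretized_gaussian_log_likelihood(
    x, means=th.zeros_like(x), log_scales=th.zeros_like(x)
)
assert kl.shape == (4,) and ll.shape == x.shape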
guided_diffusion/nn.py
ADDED
@@ -0,0 +1,170 @@
"""
Various utilities for neural networks.
"""

import math

import torch as th
import torch.nn as nn


# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
class SiLU(nn.Module):
    def forward(self, x):
        return x * th.sigmoid(x)


class GroupNorm32(nn.GroupNorm):
    def forward(self, x):
        return super().forward(x.float()).type(x.dtype)


def conv_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D convolution module.
    """
    if dims == 1:
        return nn.Conv1d(*args, **kwargs)
    elif dims == 2:
        return nn.Conv2d(*args, **kwargs)
    elif dims == 3:
        return nn.Conv3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def linear(*args, **kwargs):
    """
    Create a linear module.
    """
    return nn.Linear(*args, **kwargs)


def avg_pool_nd(dims, *args, **kwargs):
    """
    Create a 1D, 2D, or 3D average pooling module.
    """
    if dims == 1:
        return nn.AvgPool1d(*args, **kwargs)
    elif dims == 2:
        return nn.AvgPool2d(*args, **kwargs)
    elif dims == 3:
        return nn.AvgPool3d(*args, **kwargs)
    raise ValueError(f"unsupported dimensions: {dims}")


def update_ema(target_params, source_params, rate=0.99):
    """
    Update target parameters to be closer to those of source parameters using
    an exponential moving average.

    :param target_params: the target parameter sequence.
    :param source_params: the source parameter sequence.
    :param rate: the EMA rate (closer to 1 means slower).
    """
    for targ, src in zip(target_params, source_params):
        targ.detach().mul_(rate).add_(src, alpha=1 - rate)


def zero_module(module):
    """
    Zero out the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().zero_()
    return module


def scale_module(module, scale):
    """
    Scale the parameters of a module and return it.
    """
    for p in module.parameters():
        p.detach().mul_(scale)
    return module


def mean_flat(tensor):
    """
    Take the mean over all non-batch dimensions.
    """
    return tensor.mean(dim=list(range(1, len(tensor.shape))))


def normalization(channels):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return GroupNorm32(32, channels)


def timestep_embedding(timesteps, dim, max_period=10000):
    """
    Create sinusoidal timestep embeddings.

    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :return: an [N x dim] Tensor of positional embeddings.
    """
    half = dim // 2
    freqs = th.exp(
        -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
    ).to(device=timesteps.device)
    args = timesteps[:, None].float() * freqs[None]
    embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
    if dim % 2:
        embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
    return embedding


def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
        return func(*inputs)


class CheckpointFunction(th.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with th.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with th.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = th.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads
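Illustrative usage of two of these helpers (a sketch, not part of the committed file): sinusoidal timestep embeddings for a small batch, and a zero-initialized output convolution of the kind the UNet uses for its final layer.

import torch as th
from guided_diffusion.nn import timestep_embedding, zero_module, conv_nd

t = th.tensor([0, 250, 999])
emb = timestep_embedding(t, dim=128)                      # shape [3, 128]
out_conv = zero_module(conv_nd(2, 64, 3, 3, padding=1))   # all weights start at zero
assert emb.shape == (3, 128)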
guided_diffusion/respace.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import torch as th
|
3 |
+
|
4 |
+
from .gaussian_diffusion import GaussianDiffusion
|
5 |
+
|
6 |
+
|
7 |
+
def space_timesteps(num_timesteps, section_counts):
|
8 |
+
"""
|
9 |
+
Create a list of timesteps to use from an original diffusion process,
|
10 |
+
given the number of timesteps we want to take from equally-sized portions
|
11 |
+
of the original process.
|
12 |
+
|
13 |
+
For example, if there's 300 timesteps and the section counts are [10,15,20]
|
14 |
+
then the first 100 timesteps are strided to be 10 timesteps, the second 100
|
15 |
+
are strided to be 15 timesteps, and the final 100 are strided to be 20.
|
16 |
+
|
17 |
+
If the stride is a string starting with "ddim", then the fixed striding
|
18 |
+
from the DDIM paper is used, and only one section is allowed.
|
19 |
+
|
20 |
+
:param num_timesteps: the number of diffusion steps in the original
|
21 |
+
process to divide up.
|
22 |
+
:param section_counts: either a list of numbers, or a string containing
|
23 |
+
comma-separated numbers, indicating the step count
|
24 |
+
per section. As a special case, use "ddimN" where N
|
25 |
+
is a number of steps to use the striding from the
|
26 |
+
DDIM paper.
|
27 |
+
:return: a set of diffusion steps from the original process to use.
|
28 |
+
"""
|
29 |
+
if isinstance(section_counts, str):
|
30 |
+
if section_counts.startswith("ddim"):
|
31 |
+
desired_count = int(section_counts[len("ddim") :])
|
32 |
+
for i in range(1, num_timesteps):
|
33 |
+
if len(range(0, num_timesteps, i)) == desired_count:
|
34 |
+
return set(range(0, num_timesteps, i))
|
35 |
+
raise ValueError(
|
36 |
+
f"cannot create exactly {num_timesteps} steps with an integer stride"
|
37 |
+
)
|
38 |
+
section_counts = [int(x) for x in section_counts.split(",")]
|
39 |
+
size_per = num_timesteps // len(section_counts)
|
40 |
+
extra = num_timesteps % len(section_counts)
|
41 |
+
start_idx = 0
|
42 |
+
all_steps = []
|
43 |
+
for i, section_count in enumerate(section_counts):
|
44 |
+
size = size_per + (1 if i < extra else 0)
|
45 |
+
if size < section_count:
|
46 |
+
raise ValueError(
|
47 |
+
f"cannot divide section of {size} steps into {section_count}"
|
48 |
+
)
|
49 |
+
if section_count <= 1:
|
50 |
+
frac_stride = 1
|
51 |
+
else:
|
52 |
+
frac_stride = (size - 1) / (section_count - 1)
|
53 |
+
cur_idx = 0.0
|
54 |
+
taken_steps = []
|
55 |
+
for _ in range(section_count):
|
56 |
+
taken_steps.append(start_idx + round(cur_idx))
|
57 |
+
cur_idx += frac_stride
|
58 |
+
all_steps += taken_steps
|
59 |
+
start_idx += size
|
60 |
+
return set(all_steps)
|
61 |
+
|
62 |
+
|
63 |
+
class SpacedDiffusion(GaussianDiffusion):
|
64 |
+
"""
|
65 |
+
A diffusion process which can skip steps in a base diffusion process.
|
66 |
+
|
67 |
+
:param use_timesteps: a collection (sequence or set) of timesteps from the
|
68 |
+
original diffusion process to retain.
|
69 |
+
:param kwargs: the kwargs to create the base diffusion process.
|
70 |
+
"""
|
71 |
+
|
72 |
+
def __init__(self, use_timesteps, **kwargs):
|
73 |
+
self.use_timesteps = set(use_timesteps)
|
74 |
+
self.timestep_map = []
|
75 |
+
self.original_num_steps = len(kwargs["betas"])
|
76 |
+
|
77 |
+
base_diffusion = GaussianDiffusion(**kwargs) # pylint: disable=missing-kwoa
|
78 |
+
last_alpha_cumprod = 1.0
|
79 |
+
new_betas = []
|
80 |
+
for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
|
81 |
+
if i in self.use_timesteps:
|
82 |
+
new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
|
83 |
+
last_alpha_cumprod = alpha_cumprod
|
84 |
+
self.timestep_map.append(i)
|
85 |
+
kwargs["betas"] = np.array(new_betas)
|
86 |
+
super().__init__(**kwargs)
|
87 |
+
|
88 |
+
def p_mean_variance(
|
89 |
+
self, model, *args, **kwargs
|
90 |
+
): # pylint: disable=signature-differs
|
91 |
+
return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)
|
92 |
+
|
93 |
+
def training_losses(
|
94 |
+
self, model, *args, **kwargs
|
95 |
+
): # pylint: disable=signature-differs
|
96 |
+
return super().training_losses(self._wrap_model(model), *args, **kwargs)
|
97 |
+
|
98 |
+
def condition_mean(self, cond_fn, *args, **kwargs):
|
99 |
+
return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)
|
100 |
+
|
101 |
+
def condition_score(self, cond_fn, *args, **kwargs):
|
102 |
+
return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)
|
103 |
+
|
104 |
+
def _wrap_model(self, model):
|
105 |
+
if isinstance(model, _WrappedModel):
|
106 |
+
return model
|
107 |
+
return _WrappedModel(
|
108 |
+
model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
|
109 |
+
)
|
110 |
+
|
111 |
+
def _scale_timesteps(self, t):
|
112 |
+
# Scaling is done by the wrapped model.
|
113 |
+
return t
|
114 |
+
|
115 |
+
|
116 |
+
class _WrappedModel:
|
117 |
+
def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
|
118 |
+
self.model = model
|
119 |
+
self.timestep_map = timestep_map
|
120 |
+
self.rescale_timesteps = rescale_timesteps
|
121 |
+
self.original_num_steps = original_num_steps
|
122 |
+
|
123 |
+
def __call__(self, x, ts, **kwargs):
|
124 |
+
map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
|
125 |
+
new_ts = map_tensor[ts]
|
126 |
+
if self.rescale_timesteps:
|
127 |
+
new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
|
128 |
+
return self.model(x, new_ts, **kwargs)
|
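For example (illustrative only, not part of the committed file), respacing a 1000-step process with space_timesteps:

from guided_diffusion.respace import space_timesteps

steps_100 = space_timesteps(1000, "100")      # 100 evenly re-spaced steps
steps_ddim = space_timesteps(1000, "ddim50")  # DDIM-style striding: {0, 20, 40, ...}
assert len(steps_100) == 100 and len(steps_ddim) == 50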
guided_diffusion/script_util.py
ADDED
@@ -0,0 +1,452 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import inspect
|
3 |
+
|
4 |
+
from . import gaussian_diffusion as gd
|
5 |
+
from .respace import SpacedDiffusion, space_timesteps
|
6 |
+
from .unet import SuperResModel, UNetModel, EncoderUNetModel
|
7 |
+
|
8 |
+
NUM_CLASSES = 1000
|
9 |
+
|
10 |
+
|
11 |
+
def diffusion_defaults():
|
12 |
+
"""
|
13 |
+
Defaults for image and classifier training.
|
14 |
+
"""
|
15 |
+
return dict(
|
16 |
+
learn_sigma=False,
|
17 |
+
diffusion_steps=1000,
|
18 |
+
noise_schedule="linear",
|
19 |
+
timestep_respacing="",
|
20 |
+
use_kl=False,
|
21 |
+
predict_xstart=False,
|
22 |
+
rescale_timesteps=False,
|
23 |
+
rescale_learned_sigmas=False,
|
24 |
+
)
|
25 |
+
|
26 |
+
|
27 |
+
def classifier_defaults():
|
28 |
+
"""
|
29 |
+
Defaults for classifier models.
|
30 |
+
"""
|
31 |
+
return dict(
|
32 |
+
image_size=64,
|
33 |
+
classifier_use_fp16=False,
|
34 |
+
classifier_width=128,
|
35 |
+
classifier_depth=2,
|
36 |
+
classifier_attention_resolutions="32,16,8", # 16
|
37 |
+
classifier_use_scale_shift_norm=True, # False
|
38 |
+
classifier_resblock_updown=True, # False
|
39 |
+
classifier_pool="attention",
|
40 |
+
)
|
41 |
+
|
42 |
+
|
43 |
+
def model_and_diffusion_defaults():
|
44 |
+
"""
|
45 |
+
Defaults for image training.
|
46 |
+
"""
|
47 |
+
res = dict(
|
48 |
+
image_size=64,
|
49 |
+
num_channels=128,
|
50 |
+
num_res_blocks=2,
|
51 |
+
num_heads=4,
|
52 |
+
num_heads_upsample=-1,
|
53 |
+
num_head_channels=-1,
|
54 |
+
attention_resolutions="16,8",
|
55 |
+
channel_mult="",
|
56 |
+
dropout=0.0,
|
57 |
+
class_cond=False,
|
58 |
+
use_checkpoint=False,
|
59 |
+
use_scale_shift_norm=True,
|
60 |
+
resblock_updown=False,
|
61 |
+
use_fp16=False,
|
62 |
+
use_new_attention_order=False,
|
63 |
+
)
|
64 |
+
res.update(diffusion_defaults())
|
65 |
+
return res
|
66 |
+
|
67 |
+
|
68 |
+
def classifier_and_diffusion_defaults():
|
69 |
+
res = classifier_defaults()
|
70 |
+
res.update(diffusion_defaults())
|
71 |
+
return res
|
72 |
+
|
73 |
+
|
74 |
+
def create_model_and_diffusion(
|
75 |
+
image_size,
|
76 |
+
class_cond,
|
77 |
+
learn_sigma,
|
78 |
+
num_channels,
|
79 |
+
num_res_blocks,
|
80 |
+
channel_mult,
|
81 |
+
num_heads,
|
82 |
+
num_head_channels,
|
83 |
+
num_heads_upsample,
|
84 |
+
attention_resolutions,
|
85 |
+
dropout,
|
86 |
+
diffusion_steps,
|
87 |
+
noise_schedule,
|
88 |
+
timestep_respacing,
|
89 |
+
use_kl,
|
90 |
+
predict_xstart,
|
91 |
+
rescale_timesteps,
|
92 |
+
rescale_learned_sigmas,
|
93 |
+
use_checkpoint,
|
94 |
+
use_scale_shift_norm,
|
95 |
+
resblock_updown,
|
96 |
+
use_fp16,
|
97 |
+
use_new_attention_order,
|
98 |
+
):
|
99 |
+
model = create_model(
|
100 |
+
image_size,
|
101 |
+
num_channels,
|
102 |
+
num_res_blocks,
|
103 |
+
channel_mult=channel_mult,
|
104 |
+
learn_sigma=learn_sigma,
|
105 |
+
class_cond=class_cond,
|
106 |
+
use_checkpoint=use_checkpoint,
|
107 |
+
attention_resolutions=attention_resolutions,
|
108 |
+
num_heads=num_heads,
|
109 |
+
num_head_channels=num_head_channels,
|
110 |
+
num_heads_upsample=num_heads_upsample,
|
111 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
112 |
+
dropout=dropout,
|
113 |
+
resblock_updown=resblock_updown,
|
114 |
+
use_fp16=use_fp16,
|
115 |
+
use_new_attention_order=use_new_attention_order,
|
116 |
+
)
|
117 |
+
diffusion = create_gaussian_diffusion(
|
118 |
+
steps=diffusion_steps,
|
119 |
+
learn_sigma=learn_sigma,
|
120 |
+
noise_schedule=noise_schedule,
|
121 |
+
use_kl=use_kl,
|
122 |
+
predict_xstart=predict_xstart,
|
123 |
+
rescale_timesteps=rescale_timesteps,
|
124 |
+
rescale_learned_sigmas=rescale_learned_sigmas,
|
125 |
+
timestep_respacing=timestep_respacing,
|
126 |
+
)
|
127 |
+
return model, diffusion
|
128 |
+
|
129 |
+
|
130 |
+
def create_model(
|
131 |
+
image_size,
|
132 |
+
num_channels,
|
133 |
+
num_res_blocks,
|
134 |
+
channel_mult="",
|
135 |
+
learn_sigma=False,
|
136 |
+
class_cond=False,
|
137 |
+
use_checkpoint=False,
|
138 |
+
attention_resolutions="16",
|
139 |
+
num_heads=1,
|
140 |
+
num_head_channels=-1,
|
141 |
+
num_heads_upsample=-1,
|
142 |
+
use_scale_shift_norm=False,
|
143 |
+
dropout=0,
|
144 |
+
resblock_updown=False,
|
145 |
+
use_fp16=False,
|
146 |
+
use_new_attention_order=False,
|
147 |
+
):
|
148 |
+
if channel_mult == "":
|
149 |
+
if image_size == 512:
|
150 |
+
channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
|
151 |
+
elif image_size == 256:
|
152 |
+
channel_mult = (1, 1, 2, 2, 4, 4)
|
153 |
+
elif image_size == 128:
|
154 |
+
channel_mult = (1, 1, 2, 3, 4)
|
155 |
+
elif image_size == 64:
|
156 |
+
channel_mult = (1, 2, 3, 4)
|
157 |
+
else:
|
158 |
+
raise ValueError(f"unsupported image size: {image_size}")
|
159 |
+
else:
|
160 |
+
channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))
|
161 |
+
|
162 |
+
attention_ds = []
|
163 |
+
for res in attention_resolutions.split(","):
|
164 |
+
attention_ds.append(image_size // int(res))
|
165 |
+
|
166 |
+
return UNetModel(
|
167 |
+
image_size=image_size,
|
168 |
+
in_channels=3,
|
169 |
+
model_channels=num_channels,
|
170 |
+
out_channels=(3 if not learn_sigma else 6),
|
171 |
+
num_res_blocks=num_res_blocks,
|
172 |
+
attention_resolutions=tuple(attention_ds),
|
173 |
+
dropout=dropout,
|
174 |
+
channel_mult=channel_mult,
|
175 |
+
num_classes=(NUM_CLASSES if class_cond else None),
|
176 |
+
use_checkpoint=use_checkpoint,
|
177 |
+
use_fp16=use_fp16,
|
178 |
+
num_heads=num_heads,
|
179 |
+
num_head_channels=num_head_channels,
|
180 |
+
num_heads_upsample=num_heads_upsample,
|
181 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
182 |
+
resblock_updown=resblock_updown,
|
183 |
+
use_new_attention_order=use_new_attention_order,
|
184 |
+
)
|
185 |
+
|
186 |
+
|
187 |
+
def create_classifier_and_diffusion(
|
188 |
+
image_size,
|
189 |
+
classifier_use_fp16,
|
190 |
+
classifier_width,
|
191 |
+
classifier_depth,
|
192 |
+
classifier_attention_resolutions,
|
193 |
+
classifier_use_scale_shift_norm,
|
194 |
+
classifier_resblock_updown,
|
195 |
+
classifier_pool,
|
196 |
+
learn_sigma,
|
197 |
+
diffusion_steps,
|
198 |
+
noise_schedule,
|
199 |
+
timestep_respacing,
|
200 |
+
use_kl,
|
201 |
+
predict_xstart,
|
202 |
+
rescale_timesteps,
|
203 |
+
rescale_learned_sigmas,
|
204 |
+
):
|
205 |
+
classifier = create_classifier(
|
206 |
+
image_size,
|
207 |
+
classifier_use_fp16,
|
208 |
+
classifier_width,
|
209 |
+
classifier_depth,
|
210 |
+
classifier_attention_resolutions,
|
211 |
+
classifier_use_scale_shift_norm,
|
212 |
+
classifier_resblock_updown,
|
213 |
+
classifier_pool,
|
214 |
+
)
|
215 |
+
diffusion = create_gaussian_diffusion(
|
216 |
+
steps=diffusion_steps,
|
217 |
+
learn_sigma=learn_sigma,
|
218 |
+
noise_schedule=noise_schedule,
|
219 |
+
use_kl=use_kl,
|
220 |
+
predict_xstart=predict_xstart,
|
221 |
+
rescale_timesteps=rescale_timesteps,
|
222 |
+
rescale_learned_sigmas=rescale_learned_sigmas,
|
223 |
+
timestep_respacing=timestep_respacing,
|
224 |
+
)
|
225 |
+
return classifier, diffusion
|
226 |
+
|
227 |
+
|
228 |
+
def create_classifier(
|
229 |
+
image_size,
|
230 |
+
classifier_use_fp16,
|
231 |
+
classifier_width,
|
232 |
+
classifier_depth,
|
233 |
+
classifier_attention_resolutions,
|
234 |
+
classifier_use_scale_shift_norm,
|
235 |
+
classifier_resblock_updown,
|
236 |
+
classifier_pool,
|
237 |
+
):
|
238 |
+
if image_size == 512:
|
239 |
+
channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
|
240 |
+
elif image_size == 256:
|
241 |
+
channel_mult = (1, 1, 2, 2, 4, 4)
|
242 |
+
elif image_size == 128:
|
243 |
+
channel_mult = (1, 1, 2, 3, 4)
|
244 |
+
elif image_size == 64:
|
245 |
+
channel_mult = (1, 2, 3, 4)
|
246 |
+
else:
|
247 |
+
raise ValueError(f"unsupported image size: {image_size}")
|
248 |
+
|
249 |
+
attention_ds = []
|
250 |
+
for res in classifier_attention_resolutions.split(","):
|
251 |
+
attention_ds.append(image_size // int(res))
|
252 |
+
|
253 |
+
return EncoderUNetModel(
|
254 |
+
image_size=image_size,
|
255 |
+
in_channels=3,
|
256 |
+
model_channels=classifier_width,
|
257 |
+
out_channels=1000,
|
258 |
+
num_res_blocks=classifier_depth,
|
259 |
+
attention_resolutions=tuple(attention_ds),
|
260 |
+
channel_mult=channel_mult,
|
261 |
+
use_fp16=classifier_use_fp16,
|
262 |
+
num_head_channels=64,
|
263 |
+
use_scale_shift_norm=classifier_use_scale_shift_norm,
|
264 |
+
resblock_updown=classifier_resblock_updown,
|
265 |
+
pool=classifier_pool,
|
266 |
+
)
|
267 |
+
|
268 |
+
|
269 |
+
def sr_model_and_diffusion_defaults():
|
270 |
+
res = model_and_diffusion_defaults()
|
271 |
+
res["large_size"] = 256
|
272 |
+
res["small_size"] = 64
|
273 |
+
arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
|
274 |
+
for k in res.copy().keys():
|
275 |
+
if k not in arg_names:
|
276 |
+
del res[k]
|
277 |
+
return res
|
278 |
+
|
279 |
+
|
280 |
+
def sr_create_model_and_diffusion(
|
281 |
+
large_size,
|
282 |
+
small_size,
|
283 |
+
class_cond,
|
284 |
+
learn_sigma,
|
285 |
+
num_channels,
|
286 |
+
num_res_blocks,
|
287 |
+
num_heads,
|
288 |
+
num_head_channels,
|
289 |
+
num_heads_upsample,
|
290 |
+
attention_resolutions,
|
291 |
+
dropout,
|
292 |
+
diffusion_steps,
|
293 |
+
noise_schedule,
|
294 |
+
timestep_respacing,
|
295 |
+
use_kl,
|
296 |
+
predict_xstart,
|
297 |
+
rescale_timesteps,
|
298 |
+
rescale_learned_sigmas,
|
299 |
+
use_checkpoint,
|
300 |
+
use_scale_shift_norm,
|
301 |
+
resblock_updown,
|
302 |
+
use_fp16,
|
303 |
+
):
|
304 |
+
model = sr_create_model(
|
305 |
+
large_size,
|
306 |
+
small_size,
|
307 |
+
num_channels,
|
308 |
+
num_res_blocks,
|
309 |
+
learn_sigma=learn_sigma,
|
310 |
+
class_cond=class_cond,
|
311 |
+
use_checkpoint=use_checkpoint,
|
312 |
+
attention_resolutions=attention_resolutions,
|
313 |
+
num_heads=num_heads,
|
314 |
+
num_head_channels=num_head_channels,
|
315 |
+
num_heads_upsample=num_heads_upsample,
|
316 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
317 |
+
dropout=dropout,
|
318 |
+
resblock_updown=resblock_updown,
|
319 |
+
use_fp16=use_fp16,
|
320 |
+
)
|
321 |
+
diffusion = create_gaussian_diffusion(
|
322 |
+
steps=diffusion_steps,
|
323 |
+
learn_sigma=learn_sigma,
|
324 |
+
noise_schedule=noise_schedule,
|
325 |
+
use_kl=use_kl,
|
326 |
+
predict_xstart=predict_xstart,
|
327 |
+
rescale_timesteps=rescale_timesteps,
|
328 |
+
rescale_learned_sigmas=rescale_learned_sigmas,
|
329 |
+
timestep_respacing=timestep_respacing,
|
330 |
+
)
|
331 |
+
return model, diffusion
|
332 |
+
|
333 |
+
|
334 |
+
def sr_create_model(
|
335 |
+
large_size,
|
336 |
+
small_size,
|
337 |
+
num_channels,
|
338 |
+
num_res_blocks,
|
339 |
+
learn_sigma,
|
340 |
+
class_cond,
|
341 |
+
use_checkpoint,
|
342 |
+
attention_resolutions,
|
343 |
+
num_heads,
|
344 |
+
num_head_channels,
|
345 |
+
num_heads_upsample,
|
346 |
+
use_scale_shift_norm,
|
347 |
+
dropout,
|
348 |
+
resblock_updown,
|
349 |
+
use_fp16,
|
350 |
+
):
|
351 |
+
_ = small_size # hack to prevent unused variable
|
352 |
+
|
353 |
+
if large_size == 512:
|
354 |
+
channel_mult = (1, 1, 2, 2, 4, 4)
|
355 |
+
elif large_size == 256:
|
356 |
+
channel_mult = (1, 1, 2, 2, 4, 4)
|
357 |
+
elif large_size == 64:
|
358 |
+
channel_mult = (1, 2, 3, 4)
|
359 |
+
else:
|
360 |
+
raise ValueError(f"unsupported large size: {large_size}")
|
361 |
+
|
362 |
+
attention_ds = []
|
363 |
+
for res in attention_resolutions.split(","):
|
364 |
+
attention_ds.append(large_size // int(res))
|
365 |
+
|
366 |
+
return SuperResModel(
|
367 |
+
image_size=large_size,
|
368 |
+
in_channels=3,
|
369 |
+
model_channels=num_channels,
|
370 |
+
out_channels=(3 if not learn_sigma else 6),
|
371 |
+
num_res_blocks=num_res_blocks,
|
372 |
+
attention_resolutions=tuple(attention_ds),
|
373 |
+
dropout=dropout,
|
374 |
+
channel_mult=channel_mult,
|
375 |
+
num_classes=(NUM_CLASSES if class_cond else None),
|
376 |
+
use_checkpoint=use_checkpoint,
|
377 |
+
num_heads=num_heads,
|
378 |
+
num_head_channels=num_head_channels,
|
379 |
+
num_heads_upsample=num_heads_upsample,
|
380 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
381 |
+
resblock_updown=resblock_updown,
|
382 |
+
use_fp16=use_fp16,
|
383 |
+
)
|
384 |
+
|
385 |
+
|
386 |
+
def create_gaussian_diffusion(
|
387 |
+
*,
|
388 |
+
steps=1000,
|
389 |
+
learn_sigma=False,
|
390 |
+
sigma_small=False,
|
391 |
+
noise_schedule="linear",
|
392 |
+
use_kl=False,
|
393 |
+
predict_xstart=False,
|
394 |
+
rescale_timesteps=False,
|
395 |
+
rescale_learned_sigmas=False,
|
396 |
+
timestep_respacing="",
|
397 |
+
):
|
398 |
+
betas = gd.get_named_beta_schedule(noise_schedule, steps)
|
399 |
+
if use_kl:
|
400 |
+
loss_type = gd.LossType.RESCALED_KL
|
401 |
+
elif rescale_learned_sigmas:
|
402 |
+
loss_type = gd.LossType.RESCALED_MSE
|
403 |
+
else:
|
404 |
+
loss_type = gd.LossType.MSE
|
405 |
+
if not timestep_respacing:
|
406 |
+
timestep_respacing = [steps]
|
407 |
+
return SpacedDiffusion(
|
408 |
+
use_timesteps=space_timesteps(steps, timestep_respacing),
|
409 |
+
betas=betas,
|
410 |
+
model_mean_type=(
|
411 |
+
gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
|
412 |
+
),
|
413 |
+
model_var_type=(
|
414 |
+
(
|
415 |
+
gd.ModelVarType.FIXED_LARGE
|
416 |
+
if not sigma_small
|
417 |
+
else gd.ModelVarType.FIXED_SMALL
|
418 |
+
)
|
419 |
+
if not learn_sigma
|
420 |
+
else gd.ModelVarType.LEARNED_RANGE
|
421 |
+
),
|
422 |
+
loss_type=loss_type,
|
423 |
+
rescale_timesteps=rescale_timesteps,
|
424 |
+
)
|
425 |
+
|
426 |
+
|
427 |
+
def add_dict_to_argparser(parser, default_dict):
|
428 |
+
for k, v in default_dict.items():
|
429 |
+
v_type = type(v)
|
430 |
+
if v is None:
|
431 |
+
v_type = str
|
432 |
+
elif isinstance(v, bool):
|
433 |
+
v_type = str2bool
|
434 |
+
parser.add_argument(f"--{k}", default=v, type=v_type)
|
435 |
+
|
436 |
+
|
437 |
+
def args_to_dict(args, keys):
|
438 |
+
return {k: getattr(args, k) for k in keys}
|
439 |
+
|
440 |
+
|
441 |
+
def str2bool(v):
|
442 |
+
"""
|
443 |
+
https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
|
444 |
+
"""
|
445 |
+
if isinstance(v, bool):
|
446 |
+
return v
|
447 |
+
if v.lower() in ("yes", "true", "t", "y", "1"):
|
448 |
+
return True
|
449 |
+
elif v.lower() in ("no", "false", "f", "n", "0"):
|
450 |
+
return False
|
451 |
+
else:
|
452 |
+
raise argparse.ArgumentTypeError("boolean value expected")
|
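Illustrative use of the factory functions above (a sketch, not part of the committed file): the defaults dict lines up exactly with create_model_and_diffusion's signature, so a model/diffusion pair can be built by keyword expansion.

from guided_diffusion.script_util import (
    model_and_diffusion_defaults,
    create_model_and_diffusion,
)

opts = model_and_diffusion_defaults()
opts["timestep_respacing"] = "250"   # sample with 250 re-spaced steps
model, diffusion = create_model_and_diffusion(**opts)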
guided_diffusion/unet.py
ADDED
@@ -0,0 +1,894 @@
from abc import abstractmethod

import math

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .fp16_util import convert_module_to_f16, convert_module_to_f32
from .nn import (
    checkpoint,
    conv_nd,
    linear,
    avg_pool_nd,
    zero_module,
    normalization,
    timestep_embedding,
)


class AttentionPool2d(nn.Module):
    """
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        b, c, *_spatial = x.shape
        x = x.reshape(b, c, -1)  # NC(HW)
        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
        x = self.qkv_proj(x)
        x = self.attention(x)
        x = self.c_proj(x)
        return x[:, :, 0]


class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """


class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb)
            else:
                x = layer(x)
        return x


class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(
                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
            )
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        if self.use_conv:
            x = self.conv(x)
        return x


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.

    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=1
            )
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)


class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
147 |
+
:param channels: the number of input channels.
|
148 |
+
:param emb_channels: the number of timestep embedding channels.
|
149 |
+
:param dropout: the rate of dropout.
|
150 |
+
:param out_channels: if specified, the number of out channels.
|
151 |
+
:param use_conv: if True and out_channels is specified, use a spatial
|
152 |
+
convolution instead of a smaller 1x1 convolution to change the
|
153 |
+
channels in the skip connection.
|
154 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
155 |
+
:param use_checkpoint: if True, use gradient checkpointing on this module.
|
156 |
+
:param up: if True, use this block for upsampling.
|
157 |
+
:param down: if True, use this block for downsampling.
|
158 |
+
"""
|
159 |
+
|
160 |
+
def __init__(
|
161 |
+
self,
|
162 |
+
channels,
|
163 |
+
emb_channels,
|
164 |
+
dropout,
|
165 |
+
out_channels=None,
|
166 |
+
use_conv=False,
|
167 |
+
use_scale_shift_norm=False,
|
168 |
+
dims=2,
|
169 |
+
use_checkpoint=False,
|
170 |
+
up=False,
|
171 |
+
down=False,
|
172 |
+
):
|
173 |
+
super().__init__()
|
174 |
+
self.channels = channels
|
175 |
+
self.emb_channels = emb_channels
|
176 |
+
self.dropout = dropout
|
177 |
+
self.out_channels = out_channels or channels
|
178 |
+
self.use_conv = use_conv
|
179 |
+
self.use_checkpoint = use_checkpoint
|
180 |
+
self.use_scale_shift_norm = use_scale_shift_norm
|
181 |
+
|
182 |
+
self.in_layers = nn.Sequential(
|
183 |
+
normalization(channels),
|
184 |
+
nn.SiLU(),
|
185 |
+
conv_nd(dims, channels, self.out_channels, 3, padding=1),
|
186 |
+
)
|
187 |
+
|
188 |
+
self.updown = up or down
|
189 |
+
|
190 |
+
if up:
|
191 |
+
self.h_upd = Upsample(channels, False, dims)
|
192 |
+
self.x_upd = Upsample(channels, False, dims)
|
193 |
+
elif down:
|
194 |
+
self.h_upd = Downsample(channels, False, dims)
|
195 |
+
self.x_upd = Downsample(channels, False, dims)
|
196 |
+
else:
|
197 |
+
self.h_upd = self.x_upd = nn.Identity()
|
198 |
+
|
199 |
+
self.emb_layers = nn.Sequential(
|
200 |
+
nn.SiLU(),
|
201 |
+
linear(
|
202 |
+
emb_channels,
|
203 |
+
2 * self.out_channels if use_scale_shift_norm else self.out_channels,
|
204 |
+
),
|
205 |
+
)
|
206 |
+
self.out_layers = nn.Sequential(
|
207 |
+
normalization(self.out_channels),
|
208 |
+
nn.SiLU(),
|
209 |
+
nn.Dropout(p=dropout),
|
210 |
+
zero_module(
|
211 |
+
conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
|
212 |
+
),
|
213 |
+
)
|
214 |
+
|
215 |
+
if self.out_channels == channels:
|
216 |
+
self.skip_connection = nn.Identity()
|
217 |
+
elif use_conv:
|
218 |
+
self.skip_connection = conv_nd(
|
219 |
+
dims, channels, self.out_channels, 3, padding=1
|
220 |
+
)
|
221 |
+
else:
|
222 |
+
self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
|
223 |
+
|
224 |
+
def forward(self, x, emb):
|
225 |
+
"""
|
226 |
+
Apply the block to a Tensor, conditioned on a timestep embedding.
|
227 |
+
|
228 |
+
:param x: an [N x C x ...] Tensor of features.
|
229 |
+
:param emb: an [N x emb_channels] Tensor of timestep embeddings.
|
230 |
+
:return: an [N x C x ...] Tensor of outputs.
|
231 |
+
"""
|
232 |
+
return checkpoint(
|
233 |
+
self._forward, (x, emb), self.parameters(), self.use_checkpoint
|
234 |
+
)
|
235 |
+
|
236 |
+
def _forward(self, x, emb):
|
237 |
+
if self.updown:
|
238 |
+
in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
|
239 |
+
h = in_rest(x)
|
240 |
+
h = self.h_upd(h)
|
241 |
+
x = self.x_upd(x)
|
242 |
+
h = in_conv(h)
|
243 |
+
else:
|
244 |
+
h = self.in_layers(x)
|
245 |
+
emb_out = self.emb_layers(emb).type(h.dtype)
|
246 |
+
while len(emb_out.shape) < len(h.shape):
|
247 |
+
emb_out = emb_out[..., None]
|
248 |
+
if self.use_scale_shift_norm:
|
249 |
+
out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
|
250 |
+
scale, shift = th.chunk(emb_out, 2, dim=1)
|
251 |
+
h = out_norm(h) * (1 + scale) + shift
|
252 |
+
h = out_rest(h)
|
253 |
+
else:
|
254 |
+
h = h + emb_out
|
255 |
+
h = self.out_layers(h)
|
256 |
+
return self.skip_connection(x) + h
|
257 |
+
|
258 |
+
|
259 |
+
class AttentionBlock(nn.Module):
|
260 |
+
"""
|
261 |
+
An attention block that allows spatial positions to attend to each other.
|
262 |
+
|
263 |
+
Originally ported from here, but adapted to the N-d case.
|
264 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
|
265 |
+
"""
|
266 |
+
|
267 |
+
def __init__(
|
268 |
+
self,
|
269 |
+
channels,
|
270 |
+
num_heads=1,
|
271 |
+
num_head_channels=-1,
|
272 |
+
use_checkpoint=False,
|
273 |
+
use_new_attention_order=False,
|
274 |
+
):
|
275 |
+
super().__init__()
|
276 |
+
self.channels = channels
|
277 |
+
if num_head_channels == -1:
|
278 |
+
self.num_heads = num_heads
|
279 |
+
else:
|
280 |
+
assert (
|
281 |
+
channels % num_head_channels == 0
|
282 |
+
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
|
283 |
+
self.num_heads = channels // num_head_channels
|
284 |
+
self.use_checkpoint = use_checkpoint
|
285 |
+
self.norm = normalization(channels)
|
286 |
+
self.qkv = conv_nd(1, channels, channels * 3, 1)
|
287 |
+
if use_new_attention_order:
|
288 |
+
# split qkv before split heads
|
289 |
+
self.attention = QKVAttention(self.num_heads)
|
290 |
+
else:
|
291 |
+
# split heads before split qkv
|
292 |
+
self.attention = QKVAttentionLegacy(self.num_heads)
|
293 |
+
|
294 |
+
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
|
295 |
+
|
296 |
+
def forward(self, x):
|
297 |
+
return checkpoint(self._forward, (x,), self.parameters(), True)
|
298 |
+
|
299 |
+
def _forward(self, x):
|
300 |
+
b, c, *spatial = x.shape
|
301 |
+
x = x.reshape(b, c, -1)
|
302 |
+
qkv = self.qkv(self.norm(x))
|
303 |
+
h = self.attention(qkv)
|
304 |
+
h = self.proj_out(h)
|
305 |
+
return (x + h).reshape(b, c, *spatial)
|
306 |
+
|
307 |
+
|
308 |
+
def count_flops_attn(model, _x, y):
|
309 |
+
"""
|
310 |
+
A counter for the `thop` package to count the operations in an
|
311 |
+
attention operation.
|
312 |
+
Meant to be used like:
|
313 |
+
macs, params = thop.profile(
|
314 |
+
model,
|
315 |
+
inputs=(inputs, timestamps),
|
316 |
+
custom_ops={QKVAttention: QKVAttention.count_flops},
|
317 |
+
)
|
318 |
+
"""
|
319 |
+
b, c, *spatial = y[0].shape
|
320 |
+
num_spatial = int(np.prod(spatial))
|
321 |
+
# We perform two matmuls with the same number of ops.
|
322 |
+
# The first computes the weight matrix, the second computes
|
323 |
+
# the combination of the value vectors.
|
324 |
+
matmul_ops = 2 * b * (num_spatial ** 2) * c
|
325 |
+
model.total_ops += th.DoubleTensor([matmul_ops])
|
326 |
+
|
327 |
+
|
328 |
+
class QKVAttentionLegacy(nn.Module):
|
329 |
+
"""
|
330 |
+
A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
|
331 |
+
"""
|
332 |
+
|
333 |
+
def __init__(self, n_heads):
|
334 |
+
super().__init__()
|
335 |
+
self.n_heads = n_heads
|
336 |
+
|
337 |
+
def forward(self, qkv):
|
338 |
+
"""
|
339 |
+
Apply QKV attention.
|
340 |
+
|
341 |
+
:param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
|
342 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
343 |
+
"""
|
344 |
+
bs, width, length = qkv.shape
|
345 |
+
assert width % (3 * self.n_heads) == 0
|
346 |
+
ch = width // (3 * self.n_heads)
|
347 |
+
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
|
348 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
349 |
+
weight = th.einsum(
|
350 |
+
"bct,bcs->bts", q * scale, k * scale
|
351 |
+
) # More stable with f16 than dividing afterwards
|
352 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
353 |
+
a = th.einsum("bts,bcs->bct", weight, v)
|
354 |
+
return a.reshape(bs, -1, length)
|
355 |
+
|
356 |
+
@staticmethod
|
357 |
+
def count_flops(model, _x, y):
|
358 |
+
return count_flops_attn(model, _x, y)
|
359 |
+
|
360 |
+
|
361 |
+
class QKVAttention(nn.Module):
|
362 |
+
"""
|
363 |
+
A module which performs QKV attention and splits in a different order.
|
364 |
+
"""
|
365 |
+
|
366 |
+
def __init__(self, n_heads):
|
367 |
+
super().__init__()
|
368 |
+
self.n_heads = n_heads
|
369 |
+
|
370 |
+
def forward(self, qkv):
|
371 |
+
"""
|
372 |
+
Apply QKV attention.
|
373 |
+
|
374 |
+
:param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
|
375 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
376 |
+
"""
|
377 |
+
bs, width, length = qkv.shape
|
378 |
+
assert width % (3 * self.n_heads) == 0
|
379 |
+
ch = width // (3 * self.n_heads)
|
380 |
+
q, k, v = qkv.chunk(3, dim=1)
|
381 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
382 |
+
weight = th.einsum(
|
383 |
+
"bct,bcs->bts",
|
384 |
+
(q * scale).view(bs * self.n_heads, ch, length),
|
385 |
+
(k * scale).view(bs * self.n_heads, ch, length),
|
386 |
+
) # More stable with f16 than dividing afterwards
|
387 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
388 |
+
a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
|
389 |
+
return a.reshape(bs, -1, length)
|
390 |
+
|
391 |
+
@staticmethod
|
392 |
+
def count_flops(model, _x, y):
|
393 |
+
return count_flops_attn(model, _x, y)
|
394 |
+
|
395 |
+
|
396 |
+
class UNetModel(nn.Module):
|
397 |
+
"""
|
398 |
+
The full UNet model with attention and timestep embedding.
|
399 |
+
|
400 |
+
:param in_channels: channels in the input Tensor.
|
401 |
+
:param model_channels: base channel count for the model.
|
402 |
+
:param out_channels: channels in the output Tensor.
|
403 |
+
:param num_res_blocks: number of residual blocks per downsample.
|
404 |
+
:param attention_resolutions: a collection of downsample rates at which
|
405 |
+
attention will take place. May be a set, list, or tuple.
|
406 |
+
For example, if this contains 4, then at 4x downsampling, attention
|
407 |
+
will be used.
|
408 |
+
:param dropout: the dropout probability.
|
409 |
+
:param channel_mult: channel multiplier for each level of the UNet.
|
410 |
+
:param conv_resample: if True, use learned convolutions for upsampling and
|
411 |
+
downsampling.
|
412 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
413 |
+
:param num_classes: if specified (as an int), then this model will be
|
414 |
+
class-conditional with `num_classes` classes.
|
415 |
+
:param use_checkpoint: use gradient checkpointing to reduce memory usage.
|
416 |
+
:param num_heads: the number of attention heads in each attention layer.
|
417 |
+
:param num_heads_channels: if specified, ignore num_heads and instead use
|
418 |
+
a fixed channel width per attention head.
|
419 |
+
:param num_heads_upsample: works with num_heads to set a different number
|
420 |
+
of heads for upsampling. Deprecated.
|
421 |
+
:param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
|
422 |
+
:param resblock_updown: use residual blocks for up/downsampling.
|
423 |
+
:param use_new_attention_order: use a different attention pattern for potentially
|
424 |
+
increased efficiency.
|
425 |
+
"""
|
426 |
+
|
427 |
+
def __init__(
|
428 |
+
self,
|
429 |
+
image_size,
|
430 |
+
in_channels,
|
431 |
+
model_channels,
|
432 |
+
out_channels,
|
433 |
+
num_res_blocks,
|
434 |
+
attention_resolutions,
|
435 |
+
dropout=0,
|
436 |
+
channel_mult=(1, 2, 4, 8),
|
437 |
+
conv_resample=True,
|
438 |
+
dims=2,
|
439 |
+
num_classes=None,
|
440 |
+
use_checkpoint=False,
|
441 |
+
use_fp16=False,
|
442 |
+
num_heads=1,
|
443 |
+
num_head_channels=-1,
|
444 |
+
num_heads_upsample=-1,
|
445 |
+
use_scale_shift_norm=False,
|
446 |
+
resblock_updown=False,
|
447 |
+
use_new_attention_order=False,
|
448 |
+
):
|
449 |
+
super().__init__()
|
450 |
+
|
451 |
+
if num_heads_upsample == -1:
|
452 |
+
num_heads_upsample = num_heads
|
453 |
+
|
454 |
+
self.image_size = image_size
|
455 |
+
self.in_channels = in_channels
|
456 |
+
self.model_channels = model_channels
|
457 |
+
self.out_channels = out_channels
|
458 |
+
self.num_res_blocks = num_res_blocks
|
459 |
+
self.attention_resolutions = attention_resolutions
|
460 |
+
self.dropout = dropout
|
461 |
+
self.channel_mult = channel_mult
|
462 |
+
self.conv_resample = conv_resample
|
463 |
+
self.num_classes = num_classes
|
464 |
+
self.use_checkpoint = use_checkpoint
|
465 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
466 |
+
self.num_heads = num_heads
|
467 |
+
self.num_head_channels = num_head_channels
|
468 |
+
self.num_heads_upsample = num_heads_upsample
|
469 |
+
|
470 |
+
time_embed_dim = model_channels * 4
|
471 |
+
self.time_embed = nn.Sequential(
|
472 |
+
linear(model_channels, time_embed_dim),
|
473 |
+
nn.SiLU(),
|
474 |
+
linear(time_embed_dim, time_embed_dim),
|
475 |
+
)
|
476 |
+
|
477 |
+
if self.num_classes is not None:
|
478 |
+
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
479 |
+
|
480 |
+
ch = input_ch = int(channel_mult[0] * model_channels)
|
481 |
+
self.input_blocks = nn.ModuleList(
|
482 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
483 |
+
)
|
484 |
+
self._feature_size = ch
|
485 |
+
input_block_chans = [ch]
|
486 |
+
ds = 1
|
487 |
+
for level, mult in enumerate(channel_mult):
|
488 |
+
for _ in range(num_res_blocks):
|
489 |
+
layers = [
|
490 |
+
ResBlock(
|
491 |
+
ch,
|
492 |
+
time_embed_dim,
|
493 |
+
dropout,
|
494 |
+
out_channels=int(mult * model_channels),
|
495 |
+
dims=dims,
|
496 |
+
use_checkpoint=use_checkpoint,
|
497 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
498 |
+
)
|
499 |
+
]
|
500 |
+
ch = int(mult * model_channels)
|
501 |
+
if ds in attention_resolutions:
|
502 |
+
layers.append(
|
503 |
+
AttentionBlock(
|
504 |
+
ch,
|
505 |
+
use_checkpoint=use_checkpoint,
|
506 |
+
num_heads=num_heads,
|
507 |
+
num_head_channels=num_head_channels,
|
508 |
+
use_new_attention_order=use_new_attention_order,
|
509 |
+
)
|
510 |
+
)
|
511 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
512 |
+
self._feature_size += ch
|
513 |
+
input_block_chans.append(ch)
|
514 |
+
if level != len(channel_mult) - 1:
|
515 |
+
out_ch = ch
|
516 |
+
self.input_blocks.append(
|
517 |
+
TimestepEmbedSequential(
|
518 |
+
ResBlock(
|
519 |
+
ch,
|
520 |
+
time_embed_dim,
|
521 |
+
dropout,
|
522 |
+
out_channels=out_ch,
|
523 |
+
dims=dims,
|
524 |
+
use_checkpoint=use_checkpoint,
|
525 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
526 |
+
down=True,
|
527 |
+
)
|
528 |
+
if resblock_updown
|
529 |
+
else Downsample(
|
530 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
531 |
+
)
|
532 |
+
)
|
533 |
+
)
|
534 |
+
ch = out_ch
|
535 |
+
input_block_chans.append(ch)
|
536 |
+
ds *= 2
|
537 |
+
self._feature_size += ch
|
538 |
+
|
539 |
+
self.middle_block = TimestepEmbedSequential(
|
540 |
+
ResBlock(
|
541 |
+
ch,
|
542 |
+
time_embed_dim,
|
543 |
+
dropout,
|
544 |
+
dims=dims,
|
545 |
+
use_checkpoint=use_checkpoint,
|
546 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
547 |
+
),
|
548 |
+
AttentionBlock(
|
549 |
+
ch,
|
550 |
+
use_checkpoint=use_checkpoint,
|
551 |
+
num_heads=num_heads,
|
552 |
+
num_head_channels=num_head_channels,
|
553 |
+
use_new_attention_order=use_new_attention_order,
|
554 |
+
),
|
555 |
+
ResBlock(
|
556 |
+
ch,
|
557 |
+
time_embed_dim,
|
558 |
+
dropout,
|
559 |
+
dims=dims,
|
560 |
+
use_checkpoint=use_checkpoint,
|
561 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
562 |
+
),
|
563 |
+
)
|
564 |
+
self._feature_size += ch
|
565 |
+
|
566 |
+
self.output_blocks = nn.ModuleList([])
|
567 |
+
for level, mult in list(enumerate(channel_mult))[::-1]:
|
568 |
+
for i in range(num_res_blocks + 1):
|
569 |
+
ich = input_block_chans.pop()
|
570 |
+
layers = [
|
571 |
+
ResBlock(
|
572 |
+
ch + ich,
|
573 |
+
time_embed_dim,
|
574 |
+
dropout,
|
575 |
+
out_channels=int(model_channels * mult),
|
576 |
+
dims=dims,
|
577 |
+
use_checkpoint=use_checkpoint,
|
578 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
579 |
+
)
|
580 |
+
]
|
581 |
+
ch = int(model_channels * mult)
|
582 |
+
if ds in attention_resolutions:
|
583 |
+
layers.append(
|
584 |
+
AttentionBlock(
|
585 |
+
ch,
|
586 |
+
use_checkpoint=use_checkpoint,
|
587 |
+
num_heads=num_heads_upsample,
|
588 |
+
num_head_channels=num_head_channels,
|
589 |
+
use_new_attention_order=use_new_attention_order,
|
590 |
+
)
|
591 |
+
)
|
592 |
+
if level and i == num_res_blocks:
|
593 |
+
out_ch = ch
|
594 |
+
layers.append(
|
595 |
+
ResBlock(
|
596 |
+
ch,
|
597 |
+
time_embed_dim,
|
598 |
+
dropout,
|
599 |
+
out_channels=out_ch,
|
600 |
+
dims=dims,
|
601 |
+
use_checkpoint=use_checkpoint,
|
602 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
603 |
+
up=True,
|
604 |
+
)
|
605 |
+
if resblock_updown
|
606 |
+
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
|
607 |
+
)
|
608 |
+
ds //= 2
|
609 |
+
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
610 |
+
self._feature_size += ch
|
611 |
+
|
612 |
+
self.out = nn.Sequential(
|
613 |
+
normalization(ch),
|
614 |
+
nn.SiLU(),
|
615 |
+
zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
|
616 |
+
)
|
617 |
+
|
618 |
+
def convert_to_fp16(self):
|
619 |
+
"""
|
620 |
+
Convert the torso of the model to float16.
|
621 |
+
"""
|
622 |
+
self.input_blocks.apply(convert_module_to_f16)
|
623 |
+
self.middle_block.apply(convert_module_to_f16)
|
624 |
+
self.output_blocks.apply(convert_module_to_f16)
|
625 |
+
|
626 |
+
def convert_to_fp32(self):
|
627 |
+
"""
|
628 |
+
Convert the torso of the model to float32.
|
629 |
+
"""
|
630 |
+
self.input_blocks.apply(convert_module_to_f32)
|
631 |
+
self.middle_block.apply(convert_module_to_f32)
|
632 |
+
self.output_blocks.apply(convert_module_to_f32)
|
633 |
+
|
634 |
+
def forward(self, x, timesteps, y=None):
|
635 |
+
"""
|
636 |
+
Apply the model to an input batch.
|
637 |
+
|
638 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
639 |
+
:param timesteps: a 1-D batch of timesteps.
|
640 |
+
:param y: an [N] Tensor of labels, if class-conditional.
|
641 |
+
:return: an [N x C x ...] Tensor of outputs.
|
642 |
+
"""
|
643 |
+
assert (y is not None) == (
|
644 |
+
self.num_classes is not None
|
645 |
+
), "must specify y if and only if the model is class-conditional"
|
646 |
+
|
647 |
+
hs = []
|
648 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
649 |
+
|
650 |
+
if self.num_classes is not None:
|
651 |
+
assert y.shape == (x.shape[0],)
|
652 |
+
emb = emb + self.label_emb(y)
|
653 |
+
|
654 |
+
h = x.type(self.dtype)
|
655 |
+
for module in self.input_blocks:
|
656 |
+
h = module(h, emb)
|
657 |
+
hs.append(h)
|
658 |
+
h = self.middle_block(h, emb)
|
659 |
+
for module in self.output_blocks:
|
660 |
+
h = th.cat([h, hs.pop()], dim=1)
|
661 |
+
h = module(h, emb)
|
662 |
+
h = h.type(x.dtype)
|
663 |
+
return self.out(h)
|
664 |
+
|
665 |
+
|
666 |
+
class SuperResModel(UNetModel):
|
667 |
+
"""
|
668 |
+
A UNetModel that performs super-resolution.
|
669 |
+
|
670 |
+
Expects an extra kwarg `low_res` to condition on a low-resolution image.
|
671 |
+
"""
|
672 |
+
|
673 |
+
def __init__(self, image_size, in_channels, *args, **kwargs):
|
674 |
+
super().__init__(image_size, in_channels * 2, *args, **kwargs)
|
675 |
+
|
676 |
+
def forward(self, x, timesteps, low_res=None, **kwargs):
|
677 |
+
_, _, new_height, new_width = x.shape
|
678 |
+
upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
|
679 |
+
x = th.cat([x, upsampled], dim=1)
|
680 |
+
return super().forward(x, timesteps, **kwargs)
|
681 |
+
|
682 |
+
|
683 |
+
class EncoderUNetModel(nn.Module):
|
684 |
+
"""
|
685 |
+
The half UNet model with attention and timestep embedding.
|
686 |
+
|
687 |
+
For usage, see UNetModel.
|
688 |
+
"""
|
689 |
+
|
690 |
+
def __init__(
|
691 |
+
self,
|
692 |
+
image_size,
|
693 |
+
in_channels,
|
694 |
+
model_channels,
|
695 |
+
out_channels,
|
696 |
+
num_res_blocks,
|
697 |
+
attention_resolutions,
|
698 |
+
dropout=0,
|
699 |
+
channel_mult=(1, 2, 4, 8),
|
700 |
+
conv_resample=True,
|
701 |
+
dims=2,
|
702 |
+
use_checkpoint=False,
|
703 |
+
use_fp16=False,
|
704 |
+
num_heads=1,
|
705 |
+
num_head_channels=-1,
|
706 |
+
num_heads_upsample=-1,
|
707 |
+
use_scale_shift_norm=False,
|
708 |
+
resblock_updown=False,
|
709 |
+
use_new_attention_order=False,
|
710 |
+
pool="adaptive",
|
711 |
+
):
|
712 |
+
super().__init__()
|
713 |
+
|
714 |
+
if num_heads_upsample == -1:
|
715 |
+
num_heads_upsample = num_heads
|
716 |
+
|
717 |
+
self.in_channels = in_channels
|
718 |
+
self.model_channels = model_channels
|
719 |
+
self.out_channels = out_channels
|
720 |
+
self.num_res_blocks = num_res_blocks
|
721 |
+
self.attention_resolutions = attention_resolutions
|
722 |
+
self.dropout = dropout
|
723 |
+
self.channel_mult = channel_mult
|
724 |
+
self.conv_resample = conv_resample
|
725 |
+
self.use_checkpoint = use_checkpoint
|
726 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
727 |
+
self.num_heads = num_heads
|
728 |
+
self.num_head_channels = num_head_channels
|
729 |
+
self.num_heads_upsample = num_heads_upsample
|
730 |
+
|
731 |
+
time_embed_dim = model_channels * 4
|
732 |
+
self.time_embed = nn.Sequential(
|
733 |
+
linear(model_channels, time_embed_dim),
|
734 |
+
nn.SiLU(),
|
735 |
+
linear(time_embed_dim, time_embed_dim),
|
736 |
+
)
|
737 |
+
|
738 |
+
ch = int(channel_mult[0] * model_channels)
|
739 |
+
self.input_blocks = nn.ModuleList(
|
740 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
741 |
+
)
|
742 |
+
self._feature_size = ch
|
743 |
+
input_block_chans = [ch]
|
744 |
+
ds = 1
|
745 |
+
for level, mult in enumerate(channel_mult):
|
746 |
+
for _ in range(num_res_blocks):
|
747 |
+
layers = [
|
748 |
+
ResBlock(
|
749 |
+
ch,
|
750 |
+
time_embed_dim,
|
751 |
+
dropout,
|
752 |
+
out_channels=int(mult * model_channels),
|
753 |
+
dims=dims,
|
754 |
+
use_checkpoint=use_checkpoint,
|
755 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
756 |
+
)
|
757 |
+
]
|
758 |
+
ch = int(mult * model_channels)
|
759 |
+
if ds in attention_resolutions:
|
760 |
+
layers.append(
|
761 |
+
AttentionBlock(
|
762 |
+
ch,
|
763 |
+
use_checkpoint=use_checkpoint,
|
764 |
+
num_heads=num_heads,
|
765 |
+
num_head_channels=num_head_channels,
|
766 |
+
use_new_attention_order=use_new_attention_order,
|
767 |
+
)
|
768 |
+
)
|
769 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
770 |
+
self._feature_size += ch
|
771 |
+
input_block_chans.append(ch)
|
772 |
+
if level != len(channel_mult) - 1:
|
773 |
+
out_ch = ch
|
774 |
+
self.input_blocks.append(
|
775 |
+
TimestepEmbedSequential(
|
776 |
+
ResBlock(
|
777 |
+
ch,
|
778 |
+
time_embed_dim,
|
779 |
+
dropout,
|
780 |
+
out_channels=out_ch,
|
781 |
+
dims=dims,
|
782 |
+
use_checkpoint=use_checkpoint,
|
783 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
784 |
+
down=True,
|
785 |
+
)
|
786 |
+
if resblock_updown
|
787 |
+
else Downsample(
|
788 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
789 |
+
)
|
790 |
+
)
|
791 |
+
)
|
792 |
+
ch = out_ch
|
793 |
+
input_block_chans.append(ch)
|
794 |
+
ds *= 2
|
795 |
+
self._feature_size += ch
|
796 |
+
|
797 |
+
self.middle_block = TimestepEmbedSequential(
|
798 |
+
ResBlock(
|
799 |
+
ch,
|
800 |
+
time_embed_dim,
|
801 |
+
dropout,
|
802 |
+
dims=dims,
|
803 |
+
use_checkpoint=use_checkpoint,
|
804 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
805 |
+
),
|
806 |
+
AttentionBlock(
|
807 |
+
ch,
|
808 |
+
use_checkpoint=use_checkpoint,
|
809 |
+
num_heads=num_heads,
|
810 |
+
num_head_channels=num_head_channels,
|
811 |
+
use_new_attention_order=use_new_attention_order,
|
812 |
+
),
|
813 |
+
ResBlock(
|
814 |
+
ch,
|
815 |
+
time_embed_dim,
|
816 |
+
dropout,
|
817 |
+
dims=dims,
|
818 |
+
use_checkpoint=use_checkpoint,
|
819 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
820 |
+
),
|
821 |
+
)
|
822 |
+
self._feature_size += ch
|
823 |
+
self.pool = pool
|
824 |
+
if pool == "adaptive":
|
825 |
+
self.out = nn.Sequential(
|
826 |
+
normalization(ch),
|
827 |
+
nn.SiLU(),
|
828 |
+
nn.AdaptiveAvgPool2d((1, 1)),
|
829 |
+
zero_module(conv_nd(dims, ch, out_channels, 1)),
|
830 |
+
nn.Flatten(),
|
831 |
+
)
|
832 |
+
elif pool == "attention":
|
833 |
+
assert num_head_channels != -1
|
834 |
+
self.out = nn.Sequential(
|
835 |
+
normalization(ch),
|
836 |
+
nn.SiLU(),
|
837 |
+
AttentionPool2d(
|
838 |
+
(image_size // ds), ch, num_head_channels, out_channels
|
839 |
+
),
|
840 |
+
)
|
841 |
+
elif pool == "spatial":
|
842 |
+
self.out = nn.Sequential(
|
843 |
+
nn.Linear(self._feature_size, 2048),
|
844 |
+
nn.ReLU(),
|
845 |
+
nn.Linear(2048, self.out_channels),
|
846 |
+
)
|
847 |
+
elif pool == "spatial_v2":
|
848 |
+
self.out = nn.Sequential(
|
849 |
+
nn.Linear(self._feature_size, 2048),
|
850 |
+
normalization(2048),
|
851 |
+
nn.SiLU(),
|
852 |
+
nn.Linear(2048, self.out_channels),
|
853 |
+
)
|
854 |
+
else:
|
855 |
+
raise NotImplementedError(f"Unexpected {pool} pooling")
|
856 |
+
|
857 |
+
def convert_to_fp16(self):
|
858 |
+
"""
|
859 |
+
Convert the torso of the model to float16.
|
860 |
+
"""
|
861 |
+
self.input_blocks.apply(convert_module_to_f16)
|
862 |
+
self.middle_block.apply(convert_module_to_f16)
|
863 |
+
|
864 |
+
def convert_to_fp32(self):
|
865 |
+
"""
|
866 |
+
Convert the torso of the model to float32.
|
867 |
+
"""
|
868 |
+
self.input_blocks.apply(convert_module_to_f32)
|
869 |
+
self.middle_block.apply(convert_module_to_f32)
|
870 |
+
|
871 |
+
def forward(self, x, timesteps):
|
872 |
+
"""
|
873 |
+
Apply the model to an input batch.
|
874 |
+
|
875 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
876 |
+
:param timesteps: a 1-D batch of timesteps.
|
877 |
+
:return: an [N x K] Tensor of outputs.
|
878 |
+
"""
|
879 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
880 |
+
|
881 |
+
results = []
|
882 |
+
h = x.type(self.dtype)
|
883 |
+
for module in self.input_blocks:
|
884 |
+
h = module(h, emb)
|
885 |
+
if self.pool.startswith("spatial"):
|
886 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
887 |
+
h = self.middle_block(h, emb)
|
888 |
+
if self.pool.startswith("spatial"):
|
889 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
890 |
+
h = th.cat(results, axis=-1)
|
891 |
+
return self.out(h)
|
892 |
+
else:
|
893 |
+
h = h.type(x.dtype)
|
894 |
+
return self.out(h)
|
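Note: a minimal sanity check of the `UNetModel` defined above, assuming the module lives at `guided_diffusion/unet.py`; the hyperparameters below are illustrative only and are not the settings of any released checkpoint.

import torch as th
from guided_diffusion.unet import UNetModel

# illustrative hyperparameters, not tied to the LSUN checkpoints shipped with this repo
model = UNetModel(
    image_size=64, in_channels=3, model_channels=128, out_channels=3,
    num_res_blocks=2, attention_resolutions=(8,),
    channel_mult=(1, 2, 3, 4), num_head_channels=64,
)
x = th.randn(2, 3, 64, 64)      # a batch of noisy images
t = th.randint(0, 1000, (2,))   # one diffusion timestep per sample
eps = model(x, t)               # output has the same shape as x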
misc.py
ADDED
@@ -0,0 +1,53 @@
import numpy as np
import torch


def torch_samps_to_imgs(imgs, uncenter=True):
    if uncenter:
        imgs = (imgs + 1) / 2  # [-1, 1] -> [0, 1]
    imgs = (imgs * 255).clamp(0, 255)
    imgs = imgs.to(torch.uint8)
    imgs = imgs.permute(0, 2, 3, 1)
    imgs = imgs.cpu().numpy()
    return imgs


def imgs_to_torch(imgs):
    assert imgs.dtype == np.uint8
    assert len(imgs.shape) == 4 and imgs.shape[-1] == 3, "expect (N, H, W, C)"
    _, H, W, _ = imgs.shape

    imgs = imgs.transpose(0, 3, 1, 2)
    imgs = (imgs / 255).astype(np.float32)
    imgs = (imgs * 2) - 1
    imgs = torch.as_tensor(imgs)
    H, W = [_l - (_l % 32) for _l in (H, W)]
    imgs = torch.nn.functional.interpolate(imgs, (H, W), mode="bilinear")
    return imgs


def test_encode_decode():
    import imageio
    from run_img_sampling import ScoreAdapter, SD
    from vis import _draw

    fname = "~/clean.png"
    raw = imageio.imread(fname)
    raw = imgs_to_torch(raw[np.newaxis, ...])

    model: ScoreAdapter = SD().run()
    raw = raw.to(model.device)
    zs = model.encode(raw)
    img = model.decode(zs)
    img = torch_samps_to_imgs(img)
    _draw(
        [imageio.imread(fname), img.squeeze(0)],
    )


def test():
    test_encode_decode()


if __name__ == "__main__":
    test()
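Note: `imgs_to_torch` and `torch_samps_to_imgs` are inverses of each other up to the crop-to-multiple-of-32 step; a quick round-trip sketch (shapes chosen arbitrarily):

import numpy as np
from misc import imgs_to_torch, torch_samps_to_imgs

rgb = np.random.randint(0, 256, size=(1, 96, 96, 3), dtype=np.uint8)
batch = imgs_to_torch(rgb)          # float32 in [-1, 1], shape (1, 3, 96, 96)
back = torch_samps_to_imgs(batch)   # uint8 in [0, 255], shape (1, 96, 96, 3)
assert back.shape == rgb.shape      # 96 is a multiple of 32, so nothing is cropped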
my/README.md
ADDED
@@ -0,0 +1,2 @@
a personal toolkit for experiment management;
some of the design patterns are inspired by detectron2
my/__init__.py
ADDED
File without changes
|
my/config.py
ADDED
@@ -0,0 +1,234 @@
1 |
+
from typing import List, Union
|
2 |
+
from copy import deepcopy
|
3 |
+
from collections import namedtuple
|
4 |
+
from pathlib import Path
|
5 |
+
import argparse
|
6 |
+
from argparse import RawDescriptionHelpFormatter
|
7 |
+
import yaml
|
8 |
+
from pydantic import BaseModel as _Base
|
9 |
+
|
10 |
+
|
11 |
+
class BaseConf(_Base):
|
12 |
+
class Config:
|
13 |
+
validate_all = True
|
14 |
+
allow_mutation = True
|
15 |
+
extra = "ignore"
|
16 |
+
|
17 |
+
|
18 |
+
def SingleOrList(inner_type):
|
19 |
+
return Union[inner_type, List[inner_type]]
|
20 |
+
|
21 |
+
|
22 |
+
def optional_load_config(fname="config.yml"):
|
23 |
+
cfg = {}
|
24 |
+
conf_fname = Path.cwd() / fname
|
25 |
+
if conf_fname.is_file():
|
26 |
+
with conf_fname.open("r") as f:
|
27 |
+
raw = f.read()
|
28 |
+
print("loaded config\n ")
|
29 |
+
print(raw) # yaml raw itself is well formatted
|
30 |
+
cfg = yaml.safe_load(raw)
|
31 |
+
return cfg
|
32 |
+
|
33 |
+
|
34 |
+
def write_full_config(cfg_obj, fname="full_config.yml"):
|
35 |
+
cfg = cfg_obj.dict()
|
36 |
+
cfg = _dict_to_yaml(cfg)
|
37 |
+
print(f"\n--- full config ---\n\n{cfg}\n")
|
38 |
+
with (Path.cwd() / fname).open("w") as f:
|
39 |
+
f.write(cfg)
|
40 |
+
|
41 |
+
|
42 |
+
def argparse_cfg_template(curr_cfgs):
|
43 |
+
parser = argparse.ArgumentParser(
|
44 |
+
description='Manual spec of configs',
|
45 |
+
epilog=f'curr cfgs:\n\n{_dict_to_yaml(curr_cfgs)}',
|
46 |
+
formatter_class=RawDescriptionHelpFormatter
|
47 |
+
)
|
48 |
+
_, args = parser.parse_known_args()
|
49 |
+
clauses = []
|
50 |
+
for i in range(0, len(args), 2):
|
51 |
+
assert args[i][:2] == "--", "please start args with --"
|
52 |
+
clauses.append({args[i][2:]: args[i+1]})
|
53 |
+
print(f"cmdline clauses: {clauses}")
|
54 |
+
|
55 |
+
maker = ConfigMaker(curr_cfgs)
|
56 |
+
for clu in clauses:
|
57 |
+
maker.execute_clause(clu)
|
58 |
+
|
59 |
+
final = maker.state.copy()
|
60 |
+
return final
|
61 |
+
|
62 |
+
|
63 |
+
def _dict_to_yaml(arg):
|
64 |
+
return yaml.safe_dump(arg, sort_keys=False, allow_unicode=True)
|
65 |
+
|
66 |
+
|
67 |
+
def dispatch(module):
|
68 |
+
cfg = optional_load_config()
|
69 |
+
cfg = module(**cfg).dict()
|
70 |
+
|
71 |
+
cfg = argparse_cfg_template(cfg) # cmdline takes priority
|
72 |
+
mod = module(**cfg)
|
73 |
+
|
74 |
+
write_full_config(mod)
|
75 |
+
|
76 |
+
mod.run()
|
77 |
+
|
78 |
+
|
79 |
+
# below are some support tools
|
80 |
+
|
81 |
+
|
82 |
+
class ConfigMaker():
|
83 |
+
CMD = namedtuple('cmd', field_names=['sub', 'verb', 'objs'])
|
84 |
+
VERBS = ('add', 'replace', 'del')
|
85 |
+
|
86 |
+
def __init__(self, base_node):
|
87 |
+
self.state = base_node
|
88 |
+
self.clauses = []
|
89 |
+
|
90 |
+
def clone(self):
|
91 |
+
return deepcopy(self)
|
92 |
+
|
93 |
+
def execute_clause(self, raw_clause):
|
94 |
+
cls = self.__class__
|
95 |
+
assert isinstance(raw_clause, (str, dict))
|
96 |
+
if isinstance(raw_clause, dict):
|
97 |
+
assert len(raw_clause) == 1, \
|
98 |
+
"a clause can only have 1 statement: {} clauses in {}".format(
|
99 |
+
len(raw_clause), raw_clause
|
100 |
+
)
|
101 |
+
cmd = list(raw_clause.keys())[0]
|
102 |
+
arg = raw_clause[cmd]
|
103 |
+
else:
|
104 |
+
cmd = raw_clause
|
105 |
+
arg = None
|
106 |
+
cmd = self.parse_clause_cmd(cmd)
|
107 |
+
tracer = NodeTracer(self.state)
|
108 |
+
tracer.advance_pointer(path=cmd.sub)
|
109 |
+
if cmd.verb == cls.VERBS[0]:
|
110 |
+
tracer.add(cmd.objs, arg)
|
111 |
+
elif cmd.verb == cls.VERBS[1]:
|
112 |
+
tracer.replace(cmd.objs, arg)
|
113 |
+
elif cmd.verb == cls.VERBS[2]:
|
114 |
+
assert isinstance(raw_clause, str)
|
115 |
+
tracer.delete(cmd.objs)
|
116 |
+
self.state = tracer.state
|
117 |
+
|
118 |
+
@classmethod
|
119 |
+
def parse_clause_cmd(cls, input):
|
120 |
+
"""
|
121 |
+
Args:
|
122 |
+
input: a string to be parsed
|
123 |
+
1. First test whether a verb is present
|
124 |
+
2. If not present, then str is a single subject, and verb is replace
|
125 |
+
This is a syntactical sugar that makes writing config easy
|
126 |
+
3. If a verb is found, whatever comes before is a subject, and after the
|
127 |
+
objects.
|
128 |
+
4. Handle the edge cases properly. Below are expected parse outputs
|
129 |
+
input sub verb obj
|
130 |
+
--- No verb
|
131 |
+
'' '' replace []
|
132 |
+
'a.b' 'a.b' replace []
|
133 |
+
'add' '' add []
|
134 |
+
'P Q' err: 2 subjects
|
135 |
+
--- Verb present
|
136 |
+
'T add' 'T' add []
|
137 |
+
'T del a b' 'T' del [a, b]
|
138 |
+
'P Q add a' err: 2 subjects
|
139 |
+
'P add del b' err: 2 verbs
|
140 |
+
"""
|
141 |
+
assert isinstance(input, str)
|
142 |
+
input = input.split()
|
143 |
+
objs = []
|
144 |
+
sub = ''
|
145 |
+
verb, verb_inx = cls.scan_for_verb(input)
|
146 |
+
if verb is None:
|
147 |
+
assert len(input) <= 1, "no verb present; more than 1 subject: {}"\
|
148 |
+
.format(input)
|
149 |
+
sub = input[0] if len(input) == 1 else ''
|
150 |
+
verb = cls.VERBS[1]
|
151 |
+
else:
|
152 |
+
assert not verb_inx > 1, 'verb {} at inx {}; more than 1 subject in: {}'\
|
153 |
+
.format(verb, verb_inx, input)
|
154 |
+
sub = input[0] if verb_inx == 1 else ''
|
155 |
+
objs = input[verb_inx + 1:]
|
156 |
+
cmd = cls.CMD(sub=sub, verb=verb, objs=objs)
|
157 |
+
return cmd
|
158 |
+
|
159 |
+
@classmethod
|
160 |
+
def scan_for_verb(cls, input_list):
|
161 |
+
assert isinstance(input_list, list)
|
162 |
+
counts = [ input_list.count(v) for v in cls.VERBS ]
|
163 |
+
presence = [ cnt > 0 for cnt in counts ]
|
164 |
+
if sum(presence) == 0:
|
165 |
+
return None, -1
|
166 |
+
elif sum(presence) > 1:
|
167 |
+
raise ValueError("multiple verbs discovered in {}".format(input_list))
|
168 |
+
|
169 |
+
if max(counts) > 1:
|
170 |
+
raise ValueError("verbs repeated in cmd: {}".format(input_list))
|
171 |
+
# by now, there is 1 verb that has occurred exactly 1 time
|
172 |
+
verb = cls.VERBS[presence.index(1)]
|
173 |
+
inx = input_list.index(verb)
|
174 |
+
return verb, inx
|
175 |
+
|
176 |
+
|
177 |
+
class NodeTracer():
|
178 |
+
def __init__(self, src_node):
|
179 |
+
"""
|
180 |
+
A src node can be either a list or dict
|
181 |
+
"""
|
182 |
+
assert isinstance(src_node, (list, dict))
|
183 |
+
|
184 |
+
# these are movable pointers
|
185 |
+
self.child_token = "_" # init token can be anything
|
186 |
+
self.parent = {self.child_token: src_node}
|
187 |
+
|
188 |
+
# these are permanent pointers at the root
|
189 |
+
self.root_child_token = self.child_token
|
190 |
+
self.root = self.parent
|
191 |
+
|
192 |
+
@property
|
193 |
+
def state(self):
|
194 |
+
return self.root[self.root_child_token]
|
195 |
+
|
196 |
+
@property
|
197 |
+
def pointed(self):
|
198 |
+
return self.parent[self.child_token]
|
199 |
+
|
200 |
+
def advance_pointer(self, path):
|
201 |
+
if len(path) == 0:
|
202 |
+
return
|
203 |
+
path_list = list(
|
204 |
+
map(lambda x: int(x) if str.isdigit(x) else x, path.split('.'))
|
205 |
+
)
|
206 |
+
|
207 |
+
for i, token in enumerate(path_list):
|
208 |
+
self.parent = self.pointed
|
209 |
+
self.child_token = token
|
210 |
+
try:
|
211 |
+
self.pointed
|
212 |
+
except (IndexError, KeyError):
|
213 |
+
raise ValueError(
|
214 |
+
"During the tracing of {}, {}-th token '{}'"
|
215 |
+
" is not present in node {}".format(
|
216 |
+
path, i, self.child_token, self.state
|
217 |
+
)
|
218 |
+
)
|
219 |
+
|
220 |
+
def replace(self, objs, arg):
|
221 |
+
assert len(objs) == 0
|
222 |
+
val_type = type(self.parent[self.child_token])
|
223 |
+
# this is such an unfortunate hack
|
224 |
+
# turn everything to string, so that eval could work
|
225 |
+
# some of the clauses come from cmdline, some from yaml files for sow.
|
226 |
+
arg = str(arg)
|
227 |
+
if val_type == str:
|
228 |
+
pass
|
229 |
+
else:
|
230 |
+
arg = eval(arg)
|
231 |
+
assert type(arg) == val_type, \
|
232 |
+
f"require {val_type.__name__}, given {type(arg).__name__}"
|
233 |
+
|
234 |
+
self.parent[self.child_token] = arg
|
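Note: the clause grammar documented in `parse_clause_cmd` above can be exercised directly through `ConfigMaker`; a small sketch with made-up config keys:

from my.config import ConfigMaker

maker = ConfigMaker({"lr": 0.1, "model": {"depth": 4}})
maker.execute_clause({"lr": 0.01})         # bare key: implicit 'replace' at the root
maker.execute_clause({"model.depth": 8})   # dotted path descends into nested nodes
print(maker.state)                         # {'lr': 0.01, 'model': {'depth': 8}}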
my/registry.py
ADDED
@@ -0,0 +1,62 @@
1 |
+
# from detectron2
|
2 |
+
from typing import Any, Dict, Iterable, Iterator, Tuple
|
3 |
+
from tabulate import tabulate
|
4 |
+
|
5 |
+
|
6 |
+
class Registry(Iterable[Tuple[str, Any]]):
|
7 |
+
def __init__(self, name: str) -> None:
|
8 |
+
"""
|
9 |
+
Args:
|
10 |
+
name (str): the name of this registry
|
11 |
+
"""
|
12 |
+
self._name: str = name
|
13 |
+
self._obj_map: Dict[str, Any] = {}
|
14 |
+
|
15 |
+
def _do_register(self, name: str, obj: Any) -> None:
|
16 |
+
assert (
|
17 |
+
name not in self._obj_map
|
18 |
+
), "An object named '{}' was already registered in '{}' registry!".format(
|
19 |
+
name, self._name
|
20 |
+
)
|
21 |
+
self._obj_map[name] = obj
|
22 |
+
|
23 |
+
def register(self, obj: Any = None) -> Any:
|
24 |
+
"""
|
25 |
+
Register the given object under the name `obj.__name__`.
|
26 |
+
Can be used as either a decorator or not. See docstring of this class for usage.
|
27 |
+
"""
|
28 |
+
if obj is None:
|
29 |
+
# used as a decorator
|
30 |
+
def deco(func_or_class: Any) -> Any:
|
31 |
+
name = func_or_class.__name__
|
32 |
+
self._do_register(name, func_or_class)
|
33 |
+
return func_or_class
|
34 |
+
|
35 |
+
return deco
|
36 |
+
|
37 |
+
# used as a function call
|
38 |
+
name = obj.__name__
|
39 |
+
self._do_register(name, obj)
|
40 |
+
|
41 |
+
def get(self, name: str) -> Any:
|
42 |
+
ret = self._obj_map.get(name)
|
43 |
+
if ret is None:
|
44 |
+
raise KeyError(
|
45 |
+
"No object named '{}' found in '{}' registry!".format(name, self._name)
|
46 |
+
)
|
47 |
+
return ret
|
48 |
+
|
49 |
+
def __contains__(self, name: str) -> bool:
|
50 |
+
return name in self._obj_map
|
51 |
+
|
52 |
+
def __repr__(self) -> str:
|
53 |
+
table_headers = ["Names", "Objects"]
|
54 |
+
table = tabulate(
|
55 |
+
self._obj_map.items(), headers=table_headers, tablefmt="fancy_grid"
|
56 |
+
)
|
57 |
+
return "Registry of {}:\n".format(self._name) + table
|
58 |
+
|
59 |
+
def __iter__(self) -> Iterator[Tuple[str, Any]]:
|
60 |
+
return iter(self._obj_map.items())
|
61 |
+
|
62 |
+
__str__ = __repr__
|
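Note: typical usage of the registry, with a made-up class name:

from my.registry import Registry

MODELS = Registry("models")

@MODELS.register()
class TinyNet:
    pass

assert "TinyNet" in MODELS
net_cls = MODELS.get("TinyNet")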
my/utils/__init__.py
ADDED
@@ -0,0 +1,4 @@
from .event import EventStorage, get_event_storage, read_stats
from .tqdm import tqdm
from .heartbeat import HeartBeat, get_heartbeat
from .debug import EarlyLoopBreak
my/utils/debug.py
ADDED
@@ -0,0 +1,15 @@
import os

class EarlyLoopBreak():
    def __init__(self, break_at: int):
        self.iter = 0
        self.break_at = break_at
        self.on = bool(os.environ.get("EBREAK"))

    def on_break(self):
        if not self.on:
            return

        self.iter += 1
        if self.break_at > 0 and self.iter >= self.break_at:
            return True
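Note: `EarlyLoopBreak` is a no-op unless the EBREAK environment variable is set; a usage sketch:

from my.utils import EarlyLoopBreak

breaker = EarlyLoopBreak(break_at=5)
for step in range(10_000):
    if breaker.on_break():   # returns True after 5 iterations, and only when EBREAK is set
        break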
my/utils/event.py
ADDED
@@ -0,0 +1,142 @@
1 |
+
# design inspiration from detectron2
|
2 |
+
from pathlib import Path
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from contextlib import contextmanager
|
6 |
+
from .ticker import IntervalTicker
|
7 |
+
|
8 |
+
|
9 |
+
_CURRENT_STORAGE_STACK = []
|
10 |
+
|
11 |
+
|
12 |
+
def get_event_storage():
|
13 |
+
"""
|
14 |
+
Returns:
|
15 |
+
The :class:`EventStorage` object that's currently being used.
|
16 |
+
Throws an error if no :class:`EventStorage` is currently enabled.
|
17 |
+
"""
|
18 |
+
assert len(
|
19 |
+
_CURRENT_STORAGE_STACK
|
20 |
+
), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!"
|
21 |
+
return _CURRENT_STORAGE_STACK[-1]
|
22 |
+
|
23 |
+
|
24 |
+
def read_lined_json(fname):
|
25 |
+
with Path(fname).open('r') as f:
|
26 |
+
for line in f:
|
27 |
+
item = json.loads(line)
|
28 |
+
yield item
|
29 |
+
|
30 |
+
|
31 |
+
def read_stats(dirname, key):
|
32 |
+
if dirname is None or not (fname := Path(dirname) / "history.json").is_file():
|
33 |
+
return [], []
|
34 |
+
stats = read_lined_json(fname)
|
35 |
+
stats = list(filter(lambda x: key in x, stats))
|
36 |
+
xs = [e['iter'] for e in stats]
|
37 |
+
ys = [e[key] for e in stats]
|
38 |
+
return xs, ys
|
39 |
+
|
40 |
+
|
41 |
+
class EventStorage():
|
42 |
+
def __init__(self, output_dir="./", start_iter=0, flush_period=60):
|
43 |
+
self.iter = start_iter
|
44 |
+
self.ticker = IntervalTicker(flush_period)
|
45 |
+
self.history = []
|
46 |
+
self._current_prefix = ""
|
47 |
+
self._init_curr_buffer_()
|
48 |
+
|
49 |
+
self.output_dir = output_dir
|
50 |
+
self.writable = False
|
51 |
+
|
52 |
+
def _open(self):
|
53 |
+
if self.writable:
|
54 |
+
output_dir = Path(self.output_dir)
|
55 |
+
if not output_dir.is_dir():
|
56 |
+
output_dir.mkdir(parents=True, exist_ok=True)
|
57 |
+
json_fname = output_dir / 'history.json'
|
58 |
+
|
59 |
+
self._file_handle = json_fname.open('a', encoding='utf8')
|
60 |
+
self.output_dir = output_dir # make sure it's a path object
|
61 |
+
|
62 |
+
def _init_curr_buffer_(self):
|
63 |
+
self.curr_buffer = {'iter': self.iter}
|
64 |
+
|
65 |
+
def step(self, flush=False):
|
66 |
+
self.history.append(self.curr_buffer)
|
67 |
+
|
68 |
+
on_flush_period = self.ticker.tick()
|
69 |
+
if flush or on_flush_period:
|
70 |
+
self.flush_history()
|
71 |
+
|
72 |
+
self.iter += 1
|
73 |
+
self._init_curr_buffer_()
|
74 |
+
|
75 |
+
def flush_history(self):
|
76 |
+
if self.writable:
|
77 |
+
for item in self.history:
|
78 |
+
line = json.dumps(item, sort_keys=True, ensure_ascii=False) + "\n"
|
79 |
+
self._file_handle.write(line)
|
80 |
+
self._file_handle.flush()
|
81 |
+
self.history = []
|
82 |
+
|
83 |
+
def full_key(self, key):
|
84 |
+
assert isinstance(key, str)
|
85 |
+
name = self._current_prefix + key
|
86 |
+
return name
|
87 |
+
|
88 |
+
def put(self, key, val):
|
89 |
+
key = self.full_key(key)
|
90 |
+
assert isinstance(val, (int, float, str))
|
91 |
+
if isinstance(val, float):
|
92 |
+
val = round(val, 3)
|
93 |
+
self.curr_buffer[key] = val
|
94 |
+
|
95 |
+
def put_scalars(self, **kwargs):
|
96 |
+
for k, v in kwargs.items():
|
97 |
+
self.put(k, v)
|
98 |
+
|
99 |
+
def put_artifact(self, key, ext, save_func):
|
100 |
+
if not self.writable:
|
101 |
+
return
|
102 |
+
os.makedirs(self.output_dir / key, exist_ok=True)
|
103 |
+
fname = (self.output_dir / key / f"step_{self.iter}").with_suffix(ext)
|
104 |
+
fname = str(fname)
|
105 |
+
|
106 |
+
# must be called inside so that
|
107 |
+
# 1. the func is not executed if the metric is not writable
|
108 |
+
# 2. the key is only inserted if the func succeeds
|
109 |
+
save_func(fname)
|
110 |
+
self.put(key, fname)
|
111 |
+
return fname
|
112 |
+
|
113 |
+
def close(self):
|
114 |
+
self.flush_history()
|
115 |
+
if self.writable:
|
116 |
+
self._file_handle.close()
|
117 |
+
|
118 |
+
def get_last(self):
|
119 |
+
if len(self.history) > 0:
|
120 |
+
last = self.history[-1]
|
121 |
+
return last
|
122 |
+
|
123 |
+
def __enter__(self):
|
124 |
+
if len(_CURRENT_STORAGE_STACK) > 0:
|
125 |
+
parent = _CURRENT_STORAGE_STACK[-1]
|
126 |
+
root, dirname = parent.output_dir, self.output_dir
|
127 |
+
if root is not None and dirname is not None:
|
128 |
+
child_dir = parent.output_dir / f"{self.output_dir}_{parent.iter}"
|
129 |
+
self.output_dir = child_dir
|
130 |
+
parent.put(str(dirname), str(child_dir))
|
131 |
+
|
132 |
+
if self.output_dir is not None:
|
133 |
+
self.writable = True
|
134 |
+
self._open()
|
135 |
+
|
136 |
+
_CURRENT_STORAGE_STACK.append(self)
|
137 |
+
return self
|
138 |
+
|
139 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
140 |
+
assert _CURRENT_STORAGE_STACK[-1] == self
|
141 |
+
_CURRENT_STORAGE_STACK.pop()
|
142 |
+
self.close()
|
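Note: a minimal usage sketch of `EventStorage` (the output directory here is made up); scalars are buffered per iteration and flushed to history.json, which `read_stats` later parses:

from my.utils import EventStorage, read_stats

with EventStorage(output_dir="./out/demo") as metrics:
    for _ in range(100):
        metrics.put_scalars(loss=0.5, psnr=21.0)
        metrics.step()

xs, ys = read_stats("./out/demo", "loss")   # iteration indices and logged values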
my/utils/heartbeat.py
ADDED
@@ -0,0 +1,78 @@
1 |
+
# generates periodic heartbeats for remote experiment monitoring
|
2 |
+
from pathlib import Path
|
3 |
+
import json
|
4 |
+
from inspect import stack
|
5 |
+
from .ticker import IntervalTicker
|
6 |
+
|
7 |
+
_CURRENT_BEAT_STACK = []
|
8 |
+
|
9 |
+
|
10 |
+
def get_heartbeat():
|
11 |
+
"""
|
12 |
+
Returns:
|
13 |
+
The :class:`HeartBeat` object that's currently being used.
|
14 |
+
Throws an error if no :class:`HeartBeat` is currently enabled.
|
15 |
+
"""
|
16 |
+
assert len(
|
17 |
+
_CURRENT_BEAT_STACK
|
18 |
+
), "get_heartbeat() has to be called inside a 'with HeartBeat(...)' context!"
|
19 |
+
return _CURRENT_BEAT_STACK[-1]
|
20 |
+
|
21 |
+
|
22 |
+
def get_tqdm_meter(pbar, format_dict):
|
23 |
+
format_dict['bar_format'] = "{r_bar}"
|
24 |
+
meter_str = pbar.format_meter(**format_dict)
|
25 |
+
meter_str = meter_str[2:]
|
26 |
+
return meter_str
|
27 |
+
|
28 |
+
|
29 |
+
def caller_info(n_stack_up):
|
30 |
+
info = stack()[1 + n_stack_up] # 1 up as base so that it starts from caller
|
31 |
+
msg = f"{info.filename}:{info.lineno} - {info.function}"
|
32 |
+
return msg
|
33 |
+
|
34 |
+
|
35 |
+
class HeartBeat():
|
36 |
+
def __init__(
|
37 |
+
self, pbar, write_interval=10,
|
38 |
+
output_dir="./", fname="heartbeat.json"
|
39 |
+
):
|
40 |
+
self.pbar = pbar
|
41 |
+
self.fname = Path(output_dir) / fname
|
42 |
+
self.ticker = IntervalTicker(write_interval)
|
43 |
+
self.completed = False
|
44 |
+
|
45 |
+
# force one write at the beginning
|
46 |
+
self.beat(force_write=True, n_stack_up=2)
|
47 |
+
|
48 |
+
def beat(self, force_write=False, n_stack_up=1):
|
49 |
+
on_write_period = self.ticker.tick()
|
50 |
+
if force_write or on_write_period:
|
51 |
+
stats = self.stats()
|
52 |
+
stats['caller'] = caller_info(n_stack_up)
|
53 |
+
|
54 |
+
with open(self.fname, "w") as f:
|
55 |
+
json.dump(stats, f)
|
56 |
+
|
57 |
+
def done(self):
|
58 |
+
self.completed = True
|
59 |
+
self.beat(force_write=True, n_stack_up=2)
|
60 |
+
|
61 |
+
def stats(self):
|
62 |
+
pbar = self.pbar
|
63 |
+
fdict = pbar.format_dict
|
64 |
+
stats = {
|
65 |
+
"beat": self.ticker.tick_str(),
|
66 |
+
"done": self.completed,
|
67 |
+
"meter": get_tqdm_meter(pbar, fdict),
|
68 |
+
"elapsed": int(fdict['elapsed'])
|
69 |
+
}
|
70 |
+
return stats
|
71 |
+
|
72 |
+
def __enter__(self):
|
73 |
+
_CURRENT_BEAT_STACK.append(self)
|
74 |
+
return self
|
75 |
+
|
76 |
+
def __exit__(self, exc_type, exc_val, exc_tb):
|
77 |
+
assert _CURRENT_BEAT_STACK[-1] == self
|
78 |
+
_CURRENT_BEAT_STACK.pop()
|
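Note: `HeartBeat` wraps a tqdm progress bar and periodically dumps its state to heartbeat.json; a usage sketch:

from my.utils import HeartBeat, tqdm

pbar = tqdm(range(1000))
with HeartBeat(pbar) as hbeat:
    for _ in pbar:
        # ... one unit of work ...
        hbeat.beat()   # throttled: writes at most once per write_interval seconds
    hbeat.done()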
my/utils/plot.py
ADDED
@@ -0,0 +1,9 @@
import numpy as np
import matplotlib.pyplot as plt


def mpl_fig_to_buffer(fig):
    fig.canvas.draw()
    plot = np.array(fig.canvas.renderer.buffer_rgba())
    plt.close(fig)
    return plot
my/utils/seed.py
ADDED
@@ -0,0 +1,21 @@
# from pytorch lightning
import random
import numpy as np
import torch

max_seed_value = np.iinfo(np.uint32).max
min_seed_value = np.iinfo(np.uint32).min


def seed_everything(seed=None):
    seed = int(seed)

    if not (min_seed_value <= seed <= max_seed_value):
        raise ValueError(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")

    print(f"seed set to {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return seed
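Note: usage is a single call at program start:

from my.utils.seed import seed_everything

seed_everything(0)   # seeds python, numpy and torch (including CUDA) RNGs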
my/utils/ticker.py
ADDED
@@ -0,0 +1,18 @@
from datetime import date, time, datetime, timedelta
from time import sleep


class IntervalTicker():
    def __init__(self, interval=60):
        self.interval = timedelta(seconds=interval)
        self.last_tick = datetime.now()
        self.now = self.last_tick

    def tick(self):
        self.now = datetime.now()
        if (self.now - self.last_tick) > self.interval:
            self.last_tick = self.now
            return True

    def tick_str(self):
        return self.now.isoformat(timespec='seconds')
my/utils/tqdm.py
ADDED
@@ -0,0 +1,10 @@
import os
from tqdm import tqdm as orig_tqdm


def tqdm(*args, **kwargs):
    is_remote = bool(os.environ.get("IS_REMOTE", False))
    if is_remote:
        f = open(os.devnull, "w")
        kwargs.update({"file": f})
    return orig_tqdm(*args, **kwargs)
my3d.py
ADDED
@@ -0,0 +1,160 @@
1 |
+
# some tools developed for the vision class
|
2 |
+
import numpy as np
|
3 |
+
from numpy import cross, tan
|
4 |
+
from numpy.linalg import norm, inv
|
5 |
+
|
6 |
+
|
7 |
+
def normalize(v):
|
8 |
+
return v / norm(v)
|
9 |
+
|
10 |
+
|
11 |
+
def camera_pose(eye, front, up):
|
12 |
+
z = normalize(-1 * front)
|
13 |
+
x = normalize(cross(up, z))
|
14 |
+
y = normalize(cross(z, x))
|
15 |
+
|
16 |
+
# convert to col vector
|
17 |
+
x = x.reshape(-1, 1)
|
18 |
+
y = y.reshape(-1, 1)
|
19 |
+
z = z.reshape(-1, 1)
|
20 |
+
eye = eye.reshape(-1, 1)
|
21 |
+
|
22 |
+
pose = np.block([
|
23 |
+
[x, y, z, eye],
|
24 |
+
[0, 0, 0, 1]
|
25 |
+
])
|
26 |
+
return pose
|
27 |
+
|
28 |
+
|
29 |
+
def compute_extrinsics(eye, front, up):
|
30 |
+
pose = camera_pose(eye, front, up)
|
31 |
+
world_2_cam = inv(pose)
|
32 |
+
return world_2_cam
|
33 |
+
|
34 |
+
|
35 |
+
def compute_intrinsics(aspect_ratio, fov, img_height_in_pix):
|
36 |
+
# aspect ratio is w / h
|
37 |
+
ndc = compute_proj_to_normalized(aspect_ratio, fov)
|
38 |
+
|
39 |
+
# anything beyond [-1, 1] should be discarded
|
40 |
+
# note: z-clipping is not handled here;
|
41 |
+
|
42 |
+
ndc_to_img = compute_normalized_to_img_trans(aspect_ratio, img_height_in_pix)
|
43 |
+
intrinsic = ndc_to_img @ ndc
|
44 |
+
return intrinsic
|
45 |
+
|
46 |
+
|
47 |
+
def compute_proj_to_normalized(aspect, fov):
|
48 |
+
# compared to standard OpenGL NDC intrinsic,
|
49 |
+
# this skips the 3rd row treatment on z. hence the name partial_ndc
|
50 |
+
fov_in_rad = fov / 180 * np.pi
|
51 |
+
t = tan(fov_in_rad / 2) # tan half fov
|
52 |
+
partial_ndc_intrinsic = np.array([
|
53 |
+
[1 / (t * aspect), 0, 0, 0],
|
54 |
+
[0, 1 / t, 0, 0],
|
55 |
+
[0, 0, -1, 0] # copy the negative distance for division
|
56 |
+
])
|
57 |
+
return partial_ndc_intrinsic
|
58 |
+
|
59 |
+
|
60 |
+
def compute_normalized_to_img_trans(aspect, img_height_in_pix):
|
61 |
+
img_h = img_height_in_pix
|
62 |
+
img_w = img_height_in_pix * aspect
|
63 |
+
|
64 |
+
# note the OpenGL convention that (0, 0) sits at the center of the pixel;
|
65 |
+
# hence the extra -0.5 translation
|
66 |
+
# this is useful when you shoot rays through a pixel to the scene
|
67 |
+
ndc_to_img = np.array([
|
68 |
+
[img_w / 2, 0, img_w / 2 - 0.5],
|
69 |
+
[0, img_h / 2, img_h / 2 - 0.5],
|
70 |
+
[0, 0, 1]
|
71 |
+
])
|
72 |
+
|
73 |
+
img_y_coord_flip = np.array([
|
74 |
+
[1, 0, 0],
|
75 |
+
[0, -1, img_h - 1], # note the -1
|
76 |
+
[0, 0, 1]
|
77 |
+
])
|
78 |
+
|
79 |
+
# the product of the above 2 matrices is equivalent to adding
|
80 |
+
# - sign to the (1, 1) entry
|
81 |
+
# you could have simply written
|
82 |
+
# ndc_to_img = np.array([
|
83 |
+
# [img_w / 2, 0, img_w / 2 - 0.5],
|
84 |
+
# [0, -img_h / 2, img_h / 2 - 0.5],
|
85 |
+
# [0, 0, 1]
|
86 |
+
# ])
|
87 |
+
|
88 |
+
ndc_to_img = img_y_coord_flip @ ndc_to_img
|
89 |
+
return ndc_to_img
|
90 |
+
|
91 |
+
|
92 |
+
def unproject(K, pixel_coords, depth=1.0):
|
93 |
+
"""sometimes also referred to as backproject
|
94 |
+
pixel_coords: [n, 2] pixel locations
|
95 |
+
depth: [n,] or [,] depth value. of a shape that is broadcastable with pix coords
|
96 |
+
"""
|
97 |
+
K = K[0:3, 0:3]
|
98 |
+
|
99 |
+
pixel_coords = as_homogeneous(pixel_coords)
|
100 |
+
pixel_coords = pixel_coords.T # [2+1, n], so that mat mult is on the left
|
101 |
+
|
102 |
+
# this will give points with z = -1, which is exactly what you want since
|
103 |
+
# your camera is facing the -ve z axis
|
104 |
+
pts = inv(K) @ pixel_coords
|
105 |
+
|
106 |
+
pts = pts * depth # [3, n] * [n,] broadcast
|
107 |
+
pts = pts.T
|
108 |
+
pts = as_homogeneous(pts)
|
109 |
+
return pts
|
110 |
+
|
111 |
+
|
112 |
+
"""
|
113 |
+
these two functions are changed so that they can handle arbitrary number of
|
114 |
+
dimensions >=1
|
115 |
+
"""
|
116 |
+
|
117 |
+
|
118 |
+
def homogenize(pts):
|
119 |
+
# pts: [..., d], where last dim of the d is the diviser
|
120 |
+
*front, d = pts.shape
|
121 |
+
pts = pts / pts[..., -1].reshape(*front, 1)
|
122 |
+
return pts
|
123 |
+
|
124 |
+
|
125 |
+
def as_homogeneous(pts, lib=np):
|
126 |
+
# pts: [..., d]
|
127 |
+
*front, d = pts.shape
|
128 |
+
points = lib.ones((*front, d + 1))
|
129 |
+
points[..., :d] = pts
|
130 |
+
return points
|
131 |
+
|
132 |
+
|
133 |
+
def simple_point_render(pts, img_w, img_h, fov, eye, front, up):
|
134 |
+
"""
|
135 |
+
pts: [N, 3]
|
136 |
+
"""
|
137 |
+
canvas = np.ones((img_h, img_w, 3))
|
138 |
+
|
139 |
+
pts = as_homogeneous(pts)
|
140 |
+
|
141 |
+
E = compute_extrinsics(eye, front, up)
|
142 |
+
world_2_ndc = compute_proj_to_normalized(img_w / img_h, fov)
|
143 |
+
ndc_to_img = compute_normalized_to_img_trans(img_w / img_h, img_h)
|
144 |
+
|
145 |
+
pts = pts @ E.T
|
146 |
+
pts = pts @ world_2_ndc.T
|
147 |
+
pts = homogenize(pts)
|
148 |
+
|
149 |
+
# now filter out outliers beyond [-1, 1]
|
150 |
+
outlier_mask = (np.abs(pts) > 1.0).any(axis=1)
|
151 |
+
pts = pts[~outlier_mask]
|
152 |
+
|
153 |
+
pts = pts @ ndc_to_img.T
|
154 |
+
|
155 |
+
# now draw each point
|
156 |
+
pts = np.rint(pts).astype(np.int32)
|
157 |
+
xs, ys, _ = pts.T
|
158 |
+
canvas[ys, xs] = (1, 0, 0)
|
159 |
+
|
160 |
+
return canvas
|
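Not part of the commit: a minimal usage sketch for the camera helpers above, assuming only numpy and the `my3d` module added here; the eye/up values are illustrative.

```python
import numpy as np
import my3d

# place a camera 1.5 units away on the +z axis, looking at the origin
eye = np.array([0.0, 0.0, 1.5])
front = -eye                      # viewing direction: towards the origin
up = np.array([0.0, 1.0, 0.0])

pose = my3d.camera_pose(eye, front, up)      # 4x4 camera-to-world
E = my3d.compute_extrinsics(eye, front, up)  # 4x4 world-to-camera
K = my3d.compute_intrinsics(aspect_ratio=1.0, fov=60, img_height_in_pix=64)

# lift the center pixel of a 64x64 image to a 3D point at depth 1 (camera frame)
center_pix = np.array([[31.5, 31.5]])
pt_cam = my3d.unproject(K, center_pix, depth=1.0)
print(pose.shape, E.shape, K.shape, pt_cam.shape)
```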
ncsn/__init__.py
ADDED
File without changes
ncsn/bedroom.yml
ADDED
@@ -0,0 +1,69 @@
+training:
+  batch_size: 128
+  n_epochs: 500000
+  n_iters: 150001
+  snapshot_freq: 5000
+  snapshot_sampling: true
+  anneal_power: 2
+  log_all_sigmas: false
+
+sampling:
+  batch_size: 36
+  data_init: false
+  step_lr: 0.0000018
+  n_steps_each: 3
+  ckpt_id: 150000
+  final_only: true
+  fid: false
+  denoise: true
+  num_samples4fid: 10000
+  inpainting: false
+  interpolation: false
+  n_interpolations: 10
+
+fast_fid:
+  batch_size: 1000
+  num_samples: 1000
+  step_lr: 0.0000018
+  n_steps_each: 3
+  begin_ckpt: 100000
+  end_ckpt: 150000
+  verbose: false
+  ensemble: false
+
+test:
+  begin_ckpt: 5000
+  end_ckpt: 150000
+  batch_size: 100
+
+data:
+  dataset: "LSUN"
+  category: "bedroom"
+  image_size: 128
+  channels: 3
+  logit_transform: false
+  uniform_dequantization: false
+  gaussian_dequantization: false
+  random_flip: true
+  rescaled: false
+  num_workers: 32
+
+model:
+  sigma_begin: 190
+  num_classes: 1086
+  ema: true
+  ema_rate: 0.999
+  spec_norm: false
+  sigma_dist: geometric
+  sigma_end: 0.01
+  normalization: InstanceNorm++
+  nonlinearity: elu
+  ngf: 128
+
+optim:
+  weight_decay: 0.000
+  optimizer: "Adam"
+  lr: 0.0001
+  beta1: 0.9
+  amsgrad: false
+  eps: 0.00000001
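Not part of the commit: the `model.sigma_begin`, `sigma_end`, and `num_classes` entries above define a geometric noise-level schedule; a quick numpy sketch of the values they produce, mirroring `get_sigmas` in `ncsn/ncsnv2.py` below.

```python
import numpy as np

sigma_begin, sigma_end, num_classes = 190.0, 0.01, 1086
sigmas = np.exp(np.linspace(np.log(sigma_begin), np.log(sigma_end), num_classes))
print(sigmas[0], sigmas[-1], len(sigmas))  # 190.0, 0.01, 1086
```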
ncsn/ema.py
ADDED
@@ -0,0 +1,47 @@
+import copy
+import torch.nn as nn
+
+class EMAHelper(object):
+    def __init__(self, mu=0.999):
+        self.mu = mu
+        self.shadow = {}
+
+    def register(self, module):
+        if isinstance(module, nn.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if param.requires_grad:
+                self.shadow[name] = param.data.clone()
+
+    def update(self, module):
+        if isinstance(module, nn.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if param.requires_grad:
+                self.shadow[name].data = (1. - self.mu) * param.data + self.mu * self.shadow[name].data
+
+    def ema(self, module):
+        if isinstance(module, nn.DataParallel):
+            module = module.module
+        for name, param in module.named_parameters():
+            if param.requires_grad:
+                param.data.copy_(self.shadow[name].data)
+
+    def ema_copy(self, module):
+        if isinstance(module, nn.DataParallel):
+            inner_module = module.module
+            module_copy = type(inner_module)(inner_module.config).to(inner_module.config.device)
+            module_copy.load_state_dict(inner_module.state_dict())
+            module_copy = nn.DataParallel(module_copy)
+        else:
+            module_copy = type(module)(module.config).to(module.config.device)
+            module_copy.load_state_dict(module.state_dict())
+        # module_copy = copy.deepcopy(module)
+        self.ema(module_copy)
+        return module_copy
+
+    def state_dict(self):
+        return self.shadow
+
+    def load_state_dict(self, state_dict):
+        self.shadow = state_dict
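Not part of the commit: a minimal usage sketch for `EMAHelper`, with a toy `nn.Linear` standing in for the score network (so `ema_copy`, which expects a `module.config`, is not needed here).

```python
import torch
import torch.nn as nn
from ncsn.ema import EMAHelper

model = nn.Linear(4, 4)            # stand-in for the score network
ema_helper = EMAHelper(mu=0.999)
ema_helper.register(model)         # snapshot current weights as the shadow

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for _ in range(10):                # toy training loop
    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    ema_helper.update(model)       # shadow <- mu * shadow + (1 - mu) * param

ema_helper.ema(model)              # load EMA weights into the model for eval
```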
ncsn/layers.py
ADDED
@@ -0,0 +1,456 @@
+import torch.nn as nn
+import torch
+from torch.nn.parameter import Parameter
+import torch.nn.functional as F
+from .normalization import *
+from functools import partial
+import math
+import torch.nn.init as init
+
+
+def get_act(config):
+    if config.model.nonlinearity.lower() == 'elu':
+        return nn.ELU()
+    elif config.model.nonlinearity.lower() == 'relu':
+        return nn.ReLU()
+    elif config.model.nonlinearity.lower() == 'lrelu':
+        return nn.LeakyReLU(negative_slope=0.2)
+    elif config.model.nonlinearity.lower() == 'swish':
+        def swish(x):
+            return x * torch.sigmoid(x)
+        return swish
+    else:
+        raise NotImplementedError('activation function does not exist!')
+
+def spectral_norm(layer, n_iters=1):
+    return torch.nn.utils.spectral_norm(layer, n_power_iterations=n_iters)
+
+def conv1x1(in_planes, out_planes, stride=1, bias=True, spec_norm=False):
+    "1x1 convolution"
+    conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
+                     padding=0, bias=bias)
+    if spec_norm:
+        conv = spectral_norm(conv)
+    return conv
+
+
+def conv3x3(in_planes, out_planes, stride=1, bias=True, spec_norm=False):
+    "3x3 convolution with padding"
+    conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=bias)
+    if spec_norm:
+        conv = spectral_norm(conv)
+
+    return conv
+
+
+def stride_conv3x3(in_planes, out_planes, kernel_size, bias=True, spec_norm=False):
+    conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=2,
+                     padding=kernel_size // 2, bias=bias)
+    if spec_norm:
+        conv = spectral_norm(conv)
+    return conv
+
+
+def dilated_conv3x3(in_planes, out_planes, dilation, bias=True, spec_norm=False):
+    conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, padding=dilation, dilation=dilation, bias=bias)
+    if spec_norm:
+        conv = spectral_norm(conv)
+
+    return conv
+
+class CRPBlock(nn.Module):
+    def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True, spec_norm=False):
+        super().__init__()
+        self.convs = nn.ModuleList()
+        for i in range(n_stages):
+            self.convs.append(conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm))
+        self.n_stages = n_stages
+        if maxpool:
+            self.maxpool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
+        else:
+            self.maxpool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+
+        self.act = act
+
+    def forward(self, x):
+        x = self.act(x)
+        path = x
+        for i in range(self.n_stages):
+            path = self.maxpool(path)
+            path = self.convs[i](path)
+            x = path + x
+        return x
+
+
+class CondCRPBlock(nn.Module):
+    def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU(), spec_norm=False):
+        super().__init__()
+        self.convs = nn.ModuleList()
+        self.norms = nn.ModuleList()
+        self.normalizer = normalizer
+        for i in range(n_stages):
+            self.norms.append(normalizer(features, num_classes, bias=True))
+            self.convs.append(conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm))
+
+        self.n_stages = n_stages
+        self.maxpool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+        self.act = act
+
+    def forward(self, x, y):
+        x = self.act(x)
+        path = x
+        for i in range(self.n_stages):
+            path = self.norms[i](path, y)
+            path = self.maxpool(path)
+            path = self.convs[i](path)
+
+            x = path + x
+        return x
+
+
+class RCUBlock(nn.Module):
+    def __init__(self, features, n_blocks, n_stages, act=nn.ReLU(), spec_norm=False):
+        super().__init__()
+
+        for i in range(n_blocks):
+            for j in range(n_stages):
+                setattr(self, '{}_{}_conv'.format(i + 1, j + 1), conv3x3(features, features, stride=1, bias=False,
+                                                                         spec_norm=spec_norm))
+
+        self.stride = 1
+        self.n_blocks = n_blocks
+        self.n_stages = n_stages
+        self.act = act
+
+    def forward(self, x):
+        for i in range(self.n_blocks):
+            residual = x
+            for j in range(self.n_stages):
+                x = self.act(x)
+                x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+
+            x += residual
+        return x
+
+
+class CondRCUBlock(nn.Module):
+    def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU(), spec_norm=False):
+        super().__init__()
+
+        for i in range(n_blocks):
+            for j in range(n_stages):
+                setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
+                setattr(self, '{}_{}_conv'.format(i + 1, j + 1),
+                        conv3x3(features, features, stride=1, bias=False, spec_norm=spec_norm))
+
+        self.stride = 1
+        self.n_blocks = n_blocks
+        self.n_stages = n_stages
+        self.act = act
+        self.normalizer = normalizer
+
+    def forward(self, x, y):
+        for i in range(self.n_blocks):
+            residual = x
+            for j in range(self.n_stages):
+                x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
+                x = self.act(x)
+                x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+
+            x += residual
+        return x
+
+
+class MSFBlock(nn.Module):
+    def __init__(self, in_planes, features, spec_norm=False):
+        """
+        :param in_planes: tuples of input planes
+        """
+        super().__init__()
+        assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+        self.convs = nn.ModuleList()
+        self.features = features
+
+        for i in range(len(in_planes)):
+            self.convs.append(conv3x3(in_planes[i], features, stride=1, bias=True, spec_norm=spec_norm))
+
+    def forward(self, xs, shape):
+        sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+        for i in range(len(self.convs)):
+            h = self.convs[i](xs[i])
+            h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+            sums += h
+        return sums
+
+
+class CondMSFBlock(nn.Module):
+    def __init__(self, in_planes, features, num_classes, normalizer, spec_norm=False):
+        """
+        :param in_planes: tuples of input planes
+        """
+        super().__init__()
+        assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+
+        self.convs = nn.ModuleList()
+        self.norms = nn.ModuleList()
+        self.features = features
+        self.normalizer = normalizer
+
+        for i in range(len(in_planes)):
+            self.convs.append(conv3x3(in_planes[i], features, stride=1, bias=True, spec_norm=spec_norm))
+            self.norms.append(normalizer(in_planes[i], num_classes, bias=True))
+
+    def forward(self, xs, y, shape):
+        sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+        for i in range(len(self.convs)):
+            h = self.norms[i](xs[i], y)
+            h = self.convs[i](h)
+            h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+            sums += h
+        return sums
+
+
+class RefineBlock(nn.Module):
+    def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True, spec_norm=False):
+        super().__init__()
+
+        assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+        self.n_blocks = n_blocks = len(in_planes)
+
+        self.adapt_convs = nn.ModuleList()
+        for i in range(n_blocks):
+            self.adapt_convs.append(
+                RCUBlock(in_planes[i], 2, 2, act, spec_norm=spec_norm)
+            )
+
+        self.output_convs = RCUBlock(features, 3 if end else 1, 2, act, spec_norm=spec_norm)
+
+        if not start:
+            self.msf = MSFBlock(in_planes, features, spec_norm=spec_norm)
+
+        self.crp = CRPBlock(features, 2, act, maxpool=maxpool, spec_norm=spec_norm)
+
+    def forward(self, xs, output_shape):
+        assert isinstance(xs, tuple) or isinstance(xs, list)
+        hs = []
+        for i in range(len(xs)):
+            h = self.adapt_convs[i](xs[i])
+            hs.append(h)
+
+        if self.n_blocks > 1:
+            h = self.msf(hs, output_shape)
+        else:
+            h = hs[0]
+
+        h = self.crp(h)
+        h = self.output_convs(h)
+
+        return h
+
+
+
+class CondRefineBlock(nn.Module):
+    def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False, spec_norm=False):
+        super().__init__()
+
+        assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+        self.n_blocks = n_blocks = len(in_planes)
+
+        self.adapt_convs = nn.ModuleList()
+        for i in range(n_blocks):
+            self.adapt_convs.append(
+                CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act, spec_norm=spec_norm)
+            )
+
+        self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act, spec_norm=spec_norm)
+
+        if not start:
+            self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer, spec_norm=spec_norm)
+
+        self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act, spec_norm=spec_norm)
+
+    def forward(self, xs, y, output_shape):
+        assert isinstance(xs, tuple) or isinstance(xs, list)
+        hs = []
+        for i in range(len(xs)):
+            h = self.adapt_convs[i](xs[i], y)
+            hs.append(h)
+
+        if self.n_blocks > 1:
+            h = self.msf(hs, y, output_shape)
+        else:
+            h = hs[0]
+
+        h = self.crp(h, y)
+        h = self.output_convs(h, y)
+
+        return h
+
+
+class ConvMeanPool(nn.Module):
+    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False, spec_norm=False):
+        super().__init__()
+        if not adjust_padding:
+            conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+            if spec_norm:
+                conv = spectral_norm(conv)
+            self.conv = conv
+        else:
+            conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+            if spec_norm:
+                conv = spectral_norm(conv)
+
+            self.conv = nn.Sequential(
+                nn.ZeroPad2d((1, 0, 1, 0)),
+                conv
+            )
+
+    def forward(self, inputs):
+        output = self.conv(inputs)
+        output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                      output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+        return output
+
+class MeanPoolConv(nn.Module):
+    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, spec_norm=False):
+        super().__init__()
+        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+        if spec_norm:
+            self.conv = spectral_norm(self.conv)
+
+    def forward(self, inputs):
+        output = inputs
+        output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                      output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+        return self.conv(output)
+
+
+class UpsampleConv(nn.Module):
+    def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, spec_norm=False):
+        super().__init__()
+        self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+        if spec_norm:
+            self.conv = spectral_norm(self.conv)
+        self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)
+
+    def forward(self, inputs):
+        output = inputs
+        output = torch.cat([output, output, output, output], dim=1)
+        output = self.pixelshuffle(output)
+        return self.conv(output)
+
+
+class ConditionalResidualBlock(nn.Module):
+    def __init__(self, input_dim, output_dim, num_classes, resample=None, act=nn.ELU(),
+                 normalization=ConditionalBatchNorm2d, adjust_padding=False, dilation=None, spec_norm=False):
+        super().__init__()
+        self.non_linearity = act
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.resample = resample
+        self.normalization = normalization
+        if resample == 'down':
+            if dilation is not None:
+                self.conv1 = dilated_conv3x3(input_dim, input_dim, dilation=dilation, spec_norm=spec_norm)
+                self.normalize2 = normalization(input_dim, num_classes)
+                self.conv2 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
+            else:
+                self.conv1 = conv3x3(input_dim, input_dim, spec_norm=spec_norm)
+                self.normalize2 = normalization(input_dim, num_classes)
+                self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding, spec_norm=spec_norm)
+                conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding, spec_norm=spec_norm)
+
+        elif resample is None:
+            if dilation is not None:
+                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
+                self.conv1 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+                self.normalize2 = normalization(output_dim, num_classes)
+                self.conv2 = dilated_conv3x3(output_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+            else:
+                conv_shortcut = nn.Conv2d
+                self.conv1 = conv3x3(input_dim, output_dim, spec_norm=spec_norm)
+                self.normalize2 = normalization(output_dim, num_classes)
+                self.conv2 = conv3x3(output_dim, output_dim, spec_norm=spec_norm)
+        else:
+            raise Exception('invalid resample value')
+
+        if output_dim != input_dim or resample is not None:
+            self.shortcut = conv_shortcut(input_dim, output_dim)
+
+        self.normalize1 = normalization(input_dim, num_classes)
+
+
+    def forward(self, x, y):
+        output = self.normalize1(x, y)
+        output = self.non_linearity(output)
+        output = self.conv1(output)
+        output = self.normalize2(output, y)
+        output = self.non_linearity(output)
+        output = self.conv2(output)
+
+        if self.output_dim == self.input_dim and self.resample is None:
+            shortcut = x
+        else:
+            shortcut = self.shortcut(x)
+
+        return shortcut + output
+
+
+class ResidualBlock(nn.Module):
+    def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
+                 normalization=nn.BatchNorm2d, adjust_padding=False, dilation=None, spec_norm=False):
+        super().__init__()
+        self.non_linearity = act
+        self.input_dim = input_dim
+        self.output_dim = output_dim
+        self.resample = resample
+        self.normalization = normalization
+        if resample == 'down':
+            if dilation is not None:
+                self.conv1 = dilated_conv3x3(input_dim, input_dim, dilation=dilation, spec_norm=spec_norm)
+                self.normalize2 = normalization(input_dim)
+                self.conv2 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
+            else:
+                self.conv1 = conv3x3(input_dim, input_dim, spec_norm=spec_norm)
+                self.normalize2 = normalization(input_dim)
+                self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding, spec_norm=spec_norm)
+                conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding, spec_norm=spec_norm)
+
+        elif resample is None:
+            if dilation is not None:
+                conv_shortcut = partial(dilated_conv3x3, dilation=dilation, spec_norm=spec_norm)
+                self.conv1 = dilated_conv3x3(input_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+                self.normalize2 = normalization(output_dim)
+                self.conv2 = dilated_conv3x3(output_dim, output_dim, dilation=dilation, spec_norm=spec_norm)
+            else:
+                # conv_shortcut = nn.Conv2d ### Something wierd here.
+                conv_shortcut = partial(conv1x1, spec_norm=spec_norm)
+                self.conv1 = conv3x3(input_dim, output_dim, spec_norm=spec_norm)
+                self.normalize2 = normalization(output_dim)
+                self.conv2 = conv3x3(output_dim, output_dim, spec_norm=spec_norm)
+        else:
+            raise Exception('invalid resample value')
+
+        if output_dim != input_dim or resample is not None:
+            self.shortcut = conv_shortcut(input_dim, output_dim)
+
+        self.normalize1 = normalization(input_dim)
+
+
+    def forward(self, x):
+        output = self.normalize1(x)
+        output = self.non_linearity(output)
+        output = self.conv1(output)
+        output = self.normalize2(output)
+        output = self.non_linearity(output)
+        output = self.conv2(output)
+
+        if self.output_dim == self.input_dim and self.resample is None:
+            shortcut = x
+        else:
+            shortcut = self.shortcut(x)
+
+        return shortcut + output
ncsn/ncsnv2.py
ADDED
@@ -0,0 +1,314 @@
+import torch.nn as nn
+import numpy as np
+import torch.nn.functional as F
+import torch
+from functools import partial
+from .layers import *
+from .normalization import get_normalization
+
+
+def get_sigmas(config):
+    if config.model.sigma_dist == 'geometric':
+        sigmas = torch.tensor(
+            np.exp(np.linspace(np.log(config.model.sigma_begin), np.log(config.model.sigma_end),
+                               config.model.num_classes))).float().to(config.device)
+    elif config.model.sigma_dist == 'uniform':
+        sigmas = torch.tensor(
+            np.linspace(config.model.sigma_begin, config.model.sigma_end, config.model.num_classes)
+        ).float().to(config.device)
+
+    else:
+        raise NotImplementedError('sigma distribution not supported')
+
+    return sigmas
+
+
+class NCSNv2(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.logit_transform = config.data.logit_transform
+        self.rescaled = config.data.rescaled
+        self.norm = get_normalization(config, conditional=False)
+        self.ngf = ngf = config.model.ngf
+        self.num_classes = num_classes = config.model.num_classes
+
+        self.act = act = get_act(config)
+        self.register_buffer('sigmas', get_sigmas(config))
+        self.config = config
+
+        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)
+
+        self.normalizer = self.norm(ngf, self.num_classes)
+        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)
+
+        self.res1 = nn.ModuleList([
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm),
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res2 = nn.ModuleList([
+            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res3 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm, dilation=2),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm, dilation=2)]
+        )
+
+        if config.data.image_size == 28:
+            self.res4 = nn.ModuleList([
+                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                              normalization=self.norm, adjust_padding=True, dilation=4),
+                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                              normalization=self.norm, dilation=4)]
+            )
+        else:
+            self.res4 = nn.ModuleList([
+                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                              normalization=self.norm, adjust_padding=False, dilation=4),
+                ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                              normalization=self.norm, dilation=4)]
+            )
+
+        self.refine1 = RefineBlock([2 * self.ngf], 2 * self.ngf, act=act, start=True)
+        self.refine2 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
+        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
+        self.refine4 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)
+
+    def _compute_cond_module(self, module, x):
+        for m in module:
+            x = m(x)
+        return x
+
+    def forward(self, x, y):
+        if not self.logit_transform and not self.rescaled:
+            h = 2 * x - 1.
+        else:
+            h = x
+
+        output = self.begin_conv(h)
+
+        layer1 = self._compute_cond_module(self.res1, output)
+        layer2 = self._compute_cond_module(self.res2, layer1)
+        layer3 = self._compute_cond_module(self.res3, layer2)
+        layer4 = self._compute_cond_module(self.res4, layer3)
+
+        ref1 = self.refine1([layer4], layer4.shape[2:])
+        ref2 = self.refine2([layer3, ref1], layer3.shape[2:])
+        ref3 = self.refine3([layer2, ref2], layer2.shape[2:])
+        output = self.refine4([layer1, ref3], layer1.shape[2:])
+
+        output = self.normalizer(output)
+        output = self.act(output)
+        output = self.end_conv(output)
+
+        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))
+
+        output = output / used_sigmas
+
+        return output
+
+
+class NCSNv2Deeper(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.logit_transform = config.data.logit_transform
+        self.rescaled = config.data.rescaled
+        self.norm = get_normalization(config, conditional=False)
+        self.ngf = ngf = config.model.ngf
+        self.num_classes = config.model.num_classes
+        self.act = act = get_act(config)
+        self.register_buffer('sigmas', get_sigmas(config))
+        self.config = config
+
+        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)
+        self.normalizer = self.norm(ngf, self.num_classes)
+
+        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)
+
+        self.res1 = nn.ModuleList([
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm),
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res2 = nn.ModuleList([
+            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res3 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res4 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 4 * self.ngf, resample='down', act=act,
+                          normalization=self.norm, dilation=2),
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
+                          normalization=self.norm, dilation=2)]
+        )
+
+        self.res5 = nn.ModuleList([
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample='down', act=act,
+                          normalization=self.norm, dilation=4),
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
+                          normalization=self.norm, dilation=4)]
+        )
+
+        self.refine1 = RefineBlock([4 * self.ngf], 4 * self.ngf, act=act, start=True)
+        self.refine2 = RefineBlock([4 * self.ngf, 4 * self.ngf], 2 * self.ngf, act=act)
+        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
+        self.refine4 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
+        self.refine5 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)
+
+    def _compute_cond_module(self, module, x):
+        for m in module:
+            x = m(x)
+        return x
+
+    def forward(self, x, y):
+        if not self.logit_transform and not self.rescaled:
+            h = 2 * x - 1.
+        else:
+            h = x
+
+        output = self.begin_conv(h)
+
+        layer1 = self._compute_cond_module(self.res1, output)
+        layer2 = self._compute_cond_module(self.res2, layer1)
+        layer3 = self._compute_cond_module(self.res3, layer2)
+        layer4 = self._compute_cond_module(self.res4, layer3)
+        layer5 = self._compute_cond_module(self.res5, layer4)
+
+        ref1 = self.refine1([layer5], layer5.shape[2:])
+        ref2 = self.refine2([layer4, ref1], layer4.shape[2:])
+        ref3 = self.refine3([layer3, ref2], layer3.shape[2:])
+        ref4 = self.refine4([layer2, ref3], layer2.shape[2:])
+        output = self.refine5([layer1, ref4], layer1.shape[2:])
+
+        output = self.normalizer(output)
+        output = self.act(output)
+        output = self.end_conv(output)
+
+        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))
+
+        output = output / used_sigmas
+
+        return output
+
+
+class NCSNv2Deepest(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.logit_transform = config.data.logit_transform
+        self.rescaled = config.data.rescaled
+        self.norm = get_normalization(config, conditional=False)
+        self.ngf = ngf = config.model.ngf
+        self.num_classes = config.model.num_classes
+        self.act = act = get_act(config)
+        self.register_buffer('sigmas', get_sigmas(config))
+        self.config = config
+
+        self.begin_conv = nn.Conv2d(config.data.channels, ngf, 3, stride=1, padding=1)
+        self.normalizer = self.norm(ngf, self.num_classes)
+
+        self.end_conv = nn.Conv2d(ngf, config.data.channels, 3, stride=1, padding=1)
+
+        self.res1 = nn.ModuleList([
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm),
+            ResidualBlock(self.ngf, self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res2 = nn.ModuleList([
+            ResidualBlock(self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res3 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res31 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample='down', act=act,
+                          normalization=self.norm),
+            ResidualBlock(2 * self.ngf, 2 * self.ngf, resample=None, act=act,
+                          normalization=self.norm)]
+        )
+
+        self.res4 = nn.ModuleList([
+            ResidualBlock(2 * self.ngf, 4 * self.ngf, resample='down', act=act,
+                          normalization=self.norm, dilation=2),
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
+                          normalization=self.norm, dilation=2)]
+        )
+
+        self.res5 = nn.ModuleList([
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample='down', act=act,
+                          normalization=self.norm, dilation=4),
+            ResidualBlock(4 * self.ngf, 4 * self.ngf, resample=None, act=act,
+                          normalization=self.norm, dilation=4)]
+        )
+
+        self.refine1 = RefineBlock([4 * self.ngf], 4 * self.ngf, act=act, start=True)
+        self.refine2 = RefineBlock([4 * self.ngf, 4 * self.ngf], 2 * self.ngf, act=act)
+        self.refine3 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
+        self.refine31 = RefineBlock([2 * self.ngf, 2 * self.ngf], 2 * self.ngf, act=act)
+        self.refine4 = RefineBlock([2 * self.ngf, 2 * self.ngf], self.ngf, act=act)
+        self.refine5 = RefineBlock([self.ngf, self.ngf], self.ngf, act=act, end=True)
+
+    def _compute_cond_module(self, module, x):
+        for m in module:
+            x = m(x)
+        return x
+
+    def forward(self, x, y):
+        if not self.logit_transform and not self.rescaled:
+            h = 2 * x - 1.
+        else:
+            h = x
+
+        output = self.begin_conv(h)
+
+        layer1 = self._compute_cond_module(self.res1, output)
+        layer2 = self._compute_cond_module(self.res2, layer1)
+        layer3 = self._compute_cond_module(self.res3, layer2)
+        layer31 = self._compute_cond_module(self.res31, layer3)
+        layer4 = self._compute_cond_module(self.res4, layer31)
+        layer5 = self._compute_cond_module(self.res5, layer4)
+
+        ref1 = self.refine1([layer5], layer5.shape[2:])
+        ref2 = self.refine2([layer4, ref1], layer4.shape[2:])
+        ref31 = self.refine31([layer31, ref2], layer31.shape[2:])
+        ref3 = self.refine3([layer3, ref31], layer3.shape[2:])
+        ref4 = self.refine4([layer2, ref3], layer2.shape[2:])
+        output = self.refine5([layer1, ref4], layer1.shape[2:])
+
+        output = self.normalizer(output)
+        output = self.act(output)
+        output = self.end_conv(output)
+
+        used_sigmas = self.sigmas[y].view(x.shape[0], *([1] * len(x.shape[1:])))
+
+        output = output / used_sigmas
+
+        return output
ncsn/normalization.py
ADDED
@@ -0,0 +1,208 @@
+import torch
+import torch.nn as nn
+
+
+def get_normalization(config, conditional=True):
+    norm = config.model.normalization
+    if conditional:
+        if norm == 'NoneNorm':
+            return ConditionalNoneNorm2d
+        elif norm == 'InstanceNorm++':
+            return ConditionalInstanceNorm2dPlus
+        elif norm == 'InstanceNorm':
+            return ConditionalInstanceNorm2d
+        elif norm == 'BatchNorm':
+            return ConditionalBatchNorm2d
+        elif norm == 'VarianceNorm':
+            return ConditionalVarianceNorm2d
+        else:
+            raise NotImplementedError("{} does not exist!".format(norm))
+    else:
+        if norm == 'BatchNorm':
+            return nn.BatchNorm2d
+        elif norm == 'InstanceNorm':
+            return nn.InstanceNorm2d
+        elif norm == 'InstanceNorm++':
+            return InstanceNorm2dPlus
+        elif norm == 'VarianceNorm':
+            return VarianceNorm2d
+        elif norm == 'NoneNorm':
+            return NoneNorm2d
+        elif norm is None:
+            return None
+        else:
+            raise NotImplementedError("{} does not exist!".format(norm))
+
+class ConditionalBatchNorm2d(nn.Module):
+    def __init__(self, num_features, num_classes, bias=True):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.bn = nn.BatchNorm2d(num_features, affine=False)
+        if self.bias:
+            self.embed = nn.Embedding(num_classes, num_features * 2)
+            self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+        else:
+            self.embed = nn.Embedding(num_classes, num_features)
+            self.embed.weight.data.uniform_()
+
+    def forward(self, x, y):
+        out = self.bn(x)
+        if self.bias:
+            gamma, beta = self.embed(y).chunk(2, dim=1)
+            out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
+        else:
+            gamma = self.embed(y)
+            out = gamma.view(-1, self.num_features, 1, 1) * out
+        return out
+
+
+class ConditionalInstanceNorm2d(nn.Module):
+    def __init__(self, num_features, num_classes, bias=True):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+        if bias:
+            self.embed = nn.Embedding(num_classes, num_features * 2)
+            self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+        else:
+            self.embed = nn.Embedding(num_classes, num_features)
+            self.embed.weight.data.uniform_()
+
+    def forward(self, x, y):
+        h = self.instance_norm(x)
+        if self.bias:
+            gamma, beta = self.embed(y).chunk(2, dim=-1)
+            out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+        else:
+            gamma = self.embed(y)
+            out = gamma.view(-1, self.num_features, 1, 1) * h
+        return out
+
+
+class ConditionalVarianceNorm2d(nn.Module):
+    def __init__(self, num_features, num_classes, bias=False):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.embed = nn.Embedding(num_classes, num_features)
+        self.embed.weight.data.normal_(1, 0.02)
+
+    def forward(self, x, y):
+        vars = torch.var(x, dim=(2, 3), keepdim=True)
+        h = x / torch.sqrt(vars + 1e-5)
+
+        gamma = self.embed(y)
+        out = gamma.view(-1, self.num_features, 1, 1) * h
+        return out
+
+
+class VarianceNorm2d(nn.Module):
+    def __init__(self, num_features, bias=False):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.alpha = nn.Parameter(torch.zeros(num_features))
+        self.alpha.data.normal_(1, 0.02)
+
+    def forward(self, x):
+        vars = torch.var(x, dim=(2, 3), keepdim=True)
+        h = x / torch.sqrt(vars + 1e-5)
+
+        out = self.alpha.view(-1, self.num_features, 1, 1) * h
+        return out
+
+
+class ConditionalNoneNorm2d(nn.Module):
+    def __init__(self, num_features, num_classes, bias=True):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        if bias:
+            self.embed = nn.Embedding(num_classes, num_features * 2)
+            self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+            self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+        else:
+            self.embed = nn.Embedding(num_classes, num_features)
+            self.embed.weight.data.uniform_()
+
+    def forward(self, x, y):
+        if self.bias:
+            gamma, beta = self.embed(y).chunk(2, dim=-1)
+            out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
+        else:
+            gamma = self.embed(y)
+            out = gamma.view(-1, self.num_features, 1, 1) * x
+        return out
+
+
+class NoneNorm2d(nn.Module):
+    def __init__(self, num_features, bias=True):
+        super().__init__()
+
+    def forward(self, x):
+        return x
+
+
+class InstanceNorm2dPlus(nn.Module):
+    def __init__(self, num_features, bias=True):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+        self.alpha = nn.Parameter(torch.zeros(num_features))
+        self.gamma = nn.Parameter(torch.zeros(num_features))
+        self.alpha.data.normal_(1, 0.02)
+        self.gamma.data.normal_(1, 0.02)
+        if bias:
+            self.beta = nn.Parameter(torch.zeros(num_features))
+
+    def forward(self, x):
+        means = torch.mean(x, dim=(2, 3))
+        m = torch.mean(means, dim=-1, keepdim=True)
+        v = torch.var(means, dim=-1, keepdim=True)
+        means = (means - m) / (torch.sqrt(v + 1e-5))
+        h = self.instance_norm(x)
+
+        if self.bias:
+            h = h + means[..., None, None] * self.alpha[..., None, None]
+            out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
+        else:
+            h = h + means[..., None, None] * self.alpha[..., None, None]
+            out = self.gamma.view(-1, self.num_features, 1, 1) * h
+        return out
+
+
+class ConditionalInstanceNorm2dPlus(nn.Module):
+    def __init__(self, num_features, num_classes, bias=True):
+        super().__init__()
+        self.num_features = num_features
+        self.bias = bias
+        self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+        if bias:
+            self.embed = nn.Embedding(num_classes, num_features * 3)
+            self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
+            self.embed.weight.data[:, 2 * num_features:].zero_()  # Initialise bias at 0
+        else:
+            self.embed = nn.Embedding(num_classes, 2 * num_features)
+            self.embed.weight.data.normal_(1, 0.02)
+
+    def forward(self, x, y):
+        means = torch.mean(x, dim=(2, 3))
+        m = torch.mean(means, dim=-1, keepdim=True)
+        v = torch.var(means, dim=-1, keepdim=True)
+        means = (means - m) / (torch.sqrt(v + 1e-5))
+        h = self.instance_norm(x)
+
+        if self.bias:
+            gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
+            h = h + means[..., None, None] * alpha[..., None, None]
+            out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+        else:
+            gamma, alpha = self.embed(y).chunk(2, dim=-1)
+            h = h + means[..., None, None] * alpha[..., None, None]
+            out = gamma.view(-1, self.num_features, 1, 1) * h
+        return out
pose.py
ADDED
@@ -0,0 +1,120 @@
+import numpy as np
+from numpy import sin, cos
+from math import pi as π
+from my3d import camera_pose
+from my.config import BaseConf
+import random
+
+
+def get_K(H, W, FoV_x):
+    FoV_x = FoV_x / 180 * π  # to rad
+    f = 1 / np.tan(FoV_x / 2) * (W / 2)
+
+    K = np.array([
+        [f, 0, -(W/2 - 0.5)],
+        [0, -f, -(H/2 - 0.5)],
+        [0, 0, -1]
+    ])
+    return K
+
+
+SIDEVIEW_PROMPTS = [
+    "front view of", "side view of", "backside view of", "side view of"
+]
+
+TOPVIEW_PROMPT = "overhead view of"
+
+
+def train_eye_with_prompts(r, n):
+    hs = np.random.rand(n) * 360
+    vs = np.random.rand(n) * np.deg2rad(100)
+    vs = np.clip(vs, 1e-2, π-1e-2)
+
+    prompts = []
+    v_thresh = np.deg2rad(30)
+    for i in range(n):
+        _p = ""
+        if vs[i] < v_thresh:
+            _p = TOPVIEW_PROMPT
+        else:
+            _a = hs[i]
+            _a = (_a + 45) % 360
+            _quad = int(_a // 90)
+            _p = SIDEVIEW_PROMPTS[_quad]
+        prompts.append(_p)
+
+    θ = np.deg2rad(hs)
+    # φ = v
+    φ = np.arccos(1 - 2 * (vs / π))
+
+    eyes = np.zeros((n, 3))
+
+    eyes[:, 0] = r * sin(φ) * cos(π-θ)  # x
+    eyes[:, 2] = r * sin(φ) * sin(π-θ)  # z
+    eyes[:, 1] = r * cos(φ)  # y
+
+    return eyes, prompts
+
+
+def spiral_poses(
+    radius, height,
+    num_steps=20, num_rounds=1,
+    center=np.array([0, 0, 0]), up=np.array([0, 1, 0]),
+):
+    eyes = []
+    for i in range(num_steps):
+        ratio = (i + 1) / num_steps
+        Δy = height * (1 - ratio)
+
+        θ = ratio * (360 * num_rounds)
+        θ = θ / 180 * π
+        # _r = max(radius * ratio, 0.5)
+        _r = max(radius * sin(ratio * π / 2), 0.5)
+        Δx, Δz = _r * np.array([np.cos(θ), np.sin(θ)])
+        eyes.append(center + [Δx, Δy, Δz])
+
+    poses = [
+        camera_pose(e, center - e, up) for e in eyes
+    ]
+    return poses
+
+
+class PoseConfig(BaseConf):
+    rend_hw: int = 64
+    FoV: float = 60.0
+    R: float = 1.5
+
+    def make(self):
+        cfgs = self.dict()
+        hw = cfgs.pop("rend_hw")
+        cfgs["H"] = hw
+        cfgs["W"] = hw
+        return Poser(**cfgs)
+
+
+class Poser():
+    def __init__(self, H, W, FoV, R):
+        self.H, self.W = H, W
+        self.R = R
+        self.K = get_K(H, W, FoV)
+
+    def sample_train(self, n):
+        eyes, prompts = train_eye_with_prompts(r=self.R, n=n)
+        up = np.array([0, 1, 0])
+        poses = [
+            camera_pose(e, -e, up) for e in eyes
+        ]
+        poses = np.stack(poses, 0)
+        # FoV during training: [40,70]
+        random_Ks = [
+            get_K(self.H, self.W, random.random() * 30 + 40)
+            for i in range(len(poses))
+            # self.K for i in range(len(poses))
+        ]
+        # return self.K, poses, prompts
+        return random_Ks, poses, prompts
+
+    def sample_test(self, n):
+        poses = spiral_poses(self.R, self.R, n, num_rounds=3)
+        poses = np.stack(poses, axis=0)
+        return self.K, poses
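Not part of the commit: a minimal sketch of how `PoseConfig` and `Poser` are meant to be driven; the parameter values are the defaults above and the printed shapes are what the code returns.

```python
from pose import PoseConfig

poser = PoseConfig(rend_hw=64, FoV=60.0, R=1.5).make()

# training cameras: randomized-FoV intrinsics, poses on a sphere, and view prompts
Ks, poses, prompts = poser.sample_train(n=4)
print(len(Ks), poses.shape, prompts)   # 4 (4, 4, 4) e.g. ['side view of', ...]

# test cameras: a fixed-FoV spiral trajectory for turntable rendering
K, test_poses = poser.sample_test(n=10)
print(K.shape, test_poses.shape)       # (3, 3) (10, 4, 4)
```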
release/diffusion_ckpts/guided_ddpm/models/lsun_bedroom.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f9faf136dc2375dcdb392b35cee9ca9dca1fd5257b2f3358613136395ec39231
+size 2211383297
release/diffusion_ckpts/guided_ddpm/models/lsun_ffhq.pt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e409993ae12fc4cb8cd61aba7352c1bc0af0735e2debdd4b3c609280c8dc448b
+size 2211370791
release/diffusion_ckpts/stable_diffusion/sd-v1-5.ckpt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e1441589a6f3c5a53f5f54d0975a18a7feb7cdf0b0dee276dfc3331ae376a053
+size 7703807346
requirements.txt
ADDED
@@ -0,0 +1,16 @@
+pydantic
+tqdm
+click
+easydict
+tabulate
+imageio
+einops
+matplotlib
+omegaconf==2.1.1
+torchmetrics==0.6.0
+pytorch-lightning==1.4.2
+transformers
+kornia==0.6.0
+git+https://github.com/openai/CLIP.git#egg=clip
+imageio[ffmpeg]
+imageio[pyav]
run_img_sampling.py
ADDED
@@ -0,0 +1,235 @@
from pathlib import Path
import numpy as np
import torch

from misc import torch_samps_to_imgs
from adapt import Karras, ScoreAdapter, power_schedule
from adapt_gddpm import GuidedDDPM
from adapt_ncsn import NCSN as _NCSN
# from adapt_vesde import VESDE  # not included to prevent import conflicts
from adapt_sd import StableDiffusion

from my.utils import tqdm, EventStorage, HeartBeat, EarlyLoopBreak
from my.config import BaseConf, dispatch
from my.utils.seed import seed_everything


class GDDPM(BaseConf):
    """Guided DDPM from OpenAI"""
    model: str = "m_lsun_256"
    lsun_cat: str = "bedroom"
    imgnet_cat: int = -1

    def make(self):
        args = self.dict()
        model = GuidedDDPM(**args)
        return model


class SD(BaseConf):
    """Stable Diffusion"""
    variant: str = "v1"
    v2_highres: bool = False
    prompt: str = "a photograph of an astronaut riding a horse"
    scale: float = 3.0  # classifier free guidance scale
    precision: str = 'autocast'

    def make(self):
        args = self.dict()
        model = StableDiffusion(**args)
        return model


class SDE(BaseConf):
    def make(self):
        args = self.dict()
        model = VESDE(**args)
        return model


class NCSN(BaseConf):
    def make(self):
        args = self.dict()
        model = _NCSN(**args)
        return model


class KarrasGen(BaseConf):
    family: str = "gddpm"
    gddpm: GDDPM = GDDPM()
    sd: SD = SD()
    # sde: SDE = SDE()
    ncsn: NCSN = NCSN()

    batch_size: int = 10
    num_images: int = 1250
    num_t: int = 40
    σ_max: float = 80.0
    heun: bool = True
    langevin: bool = False
    cls_scaling: float = 1.0  # classifier guidance scaling

    def run(self):
        args = self.dict()
        family = args.pop("family")
        model = getattr(self, family).make()
        self.karras_generate(model, **args)

    @staticmethod
    def karras_generate(
        model: ScoreAdapter,
        batch_size, num_images, σ_max, num_t, langevin, heun, cls_scaling,
        **kwargs
    ):
        del kwargs  # removed extra args
        num_batches = num_images // batch_size

        fuse = EarlyLoopBreak(5)
        with tqdm(total=num_batches) as pbar, \
            HeartBeat(pbar) as hbeat, \
                EventStorage() as metric:

            all_imgs = []

            for _ in range(num_batches):
                if fuse.on_break():
                    break

                pipeline = Karras.inference(
                    model, batch_size, num_t,
                    init_xs=None, heun=heun, σ_max=σ_max,
                    langevin=langevin, cls_scaling=cls_scaling
                )

                for imgs in tqdm(pipeline, total=num_t+1, disable=False):
                    # _std = imgs.std().item()
                    # print(_std)
                    hbeat.beat()
                    pass

                if isinstance(model, StableDiffusion):
                    imgs = model.decode(imgs)

                imgs = torch_samps_to_imgs(imgs, uncenter=model.samps_centered())
                all_imgs.append(imgs)

                pbar.update()

            all_imgs = np.concatenate(all_imgs, axis=0)
            metric.put_artifact("imgs", ".npy", lambda fn: np.save(fn, all_imgs))
            metric.step()
            hbeat.done()


class SMLDGen(BaseConf):
    family: str = "ncsn"
    gddpm: GDDPM = GDDPM()
    # sde: SDE = SDE()
    ncsn: NCSN = NCSN()

    batch_size: int = 16
    num_images: int = 16
    num_stages: int = 80
    num_steps: int = 15
    σ_max: float = 80.0
    ε: float = 1e-5

    def run(self):
        args = self.dict()
        family = args.pop("family")
        model = getattr(self, family).make()
        self.smld_generate(model, **args)

    @staticmethod
    def smld_generate(
        model: ScoreAdapter,
        batch_size, num_images, num_stages, num_steps, σ_max, ε,
        **kwargs
    ):
        num_batches = num_images // batch_size
        σs = power_schedule(σ_max, model.σ_min, num_stages)
        σs = [model.snap_t_to_nearest_tick(σ)[0] for σ in σs]

        fuse = EarlyLoopBreak(5)
        with tqdm(total=num_batches) as pbar, \
            HeartBeat(pbar) as hbeat, \
                EventStorage() as metric:

            all_imgs = []

            for _ in range(num_batches):
                if fuse.on_break():
                    break

                init_xs = torch.rand(batch_size, *model.data_shape(), device=model.device)
                if model.samps_centered():
                    init_xs = init_xs * 2 - 1  # [0, 1] -> [-1, 1]

                pipeline = smld_inference(
                    model, σs, num_steps, ε, init_xs
                )

                for imgs in tqdm(pipeline, total=(num_stages * num_steps)+1, disable=False):
                    pbar.set_description(f"{imgs.max().item():.3f}")
                    metric.put_scalars(
                        max=imgs.max().item(), min=imgs.min().item(), std=imgs.std().item()
                    )
                    metric.step()
                    hbeat.beat()

                pbar.update()
                imgs = torch_samps_to_imgs(imgs, uncenter=model.samps_centered())
                all_imgs.append(imgs)

            all_imgs = np.concatenate(all_imgs, axis=0)
            metric.put_artifact("imgs", ".npy", lambda fn: np.save(fn, all_imgs))
            metric.step()
            hbeat.done()


def smld_inference(model, σs, num_steps, ε, init_xs):
    from math import sqrt
    # not doing conditioning or cls guidance; for gddpm only lsun works; fine.

    xs = init_xs
    yield xs

    for i in range(len(σs)):
        α_i = ε * ((σs[i] / σs[-1]) ** 2)
        for _ in range(num_steps):
            grad = model.score(xs, σs[i])
            z = torch.randn_like(xs)
            xs = xs + α_i * grad + sqrt(2 * α_i) * z
            yield xs


def load_np_imgs(fname):
    fname = Path(fname)
    data = np.load(fname)
    if fname.suffix == ".npz":
        imgs = data['arr_0']
    else:
        imgs = data
    return imgs


def visualize(max_n_imgs=16):
    import torchvision.utils as vutils
    from imageio import imwrite
    from einops import rearrange

    all_imgs = load_np_imgs("imgs/step_0.npy")

    imgs = all_imgs[:max_n_imgs]
    imgs = rearrange(imgs, "N H W C -> N C H W", C=3)
    imgs = torch.from_numpy(imgs)
    pane = vutils.make_grid(imgs, padding=2, nrow=4)
    pane = rearrange(pane, "C H W -> H W C", C=3)
    pane = pane.numpy()
    imwrite("preview.jpg", pane)


if __name__ == "__main__":
    seed_everything(0)
    dispatch(KarrasGen)
    visualize(16)
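For reference, a self-contained restatement of the annealed Langevin update that smld_inference above performs; the score function here is a Gaussian stand-in for illustration, not the actual NCSN adapter:

import torch
from math import sqrt

def toy_score(x, σ):                  # stand-in for model.score(x, σ)
    return -x / (σ ** 2)              # score of an isotropic Gaussian, for illustration only

σs = [10.0, 5.0, 1.0, 0.1]            # annealing schedule, largest noise level first
ε, num_steps = 1e-5, 15
x = torch.rand(4, 3, 8, 8) * 2 - 1    # centered initialization, as in smld_generate

for σ in σs:
    α = ε * (σ / σs[-1]) ** 2         # step size shrinks quadratically with the noise level
    for _ in range(num_steps):
        x = x + α * toy_score(x, σ) + sqrt(2 * α) * torch.randn_like(x)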
run_nerf.py
ADDED
@@ -0,0 +1,62 @@
from typing import List
from pydantic import validator

from my.config import BaseConf, SingleOrList, dispatch
from my.utils.seed import seed_everything

import numpy as np
from voxnerf.vox import VOXRF_REGISTRY
from voxnerf.pipelines import train


class VoxConfig(BaseConf):
    model_type: str = "VoxRF"
    bbox_len: float = 1.5
    grid_size: SingleOrList(int) = [128, 128, 128]
    step_ratio: float = 0.5
    density_shift: float = -10.
    ray_march_weight_thres: float = 0.0001
    c: int = 3
    blend_bg_texture: bool = False
    bg_texture_hw: int = 64

    @validator("grid_size")
    def check_gsize(cls, grid_size):
        if isinstance(grid_size, int):
            return [grid_size, ] * 3
        else:
            assert len(grid_size) == 3
            return grid_size

    def make(self):
        params = self.dict()
        m_type = params.pop("model_type")
        model_fn = VOXRF_REGISTRY.get(m_type)

        radius = params.pop('bbox_len')
        aabb = radius * np.array([
            [-1, -1, -1],
            [1, 1, 1]
        ])
        model = model_fn(aabb=aabb, **params)
        return model


class TrainerConfig(BaseConf):
    model: VoxConfig = VoxConfig()
    scene: str = "lego"
    n_epoch: int = 2
    bs: int = 4096
    lr: float = 0.02

    def run(self):
        args = self.dict()
        args.pop("model")

        model = self.model.make()
        train(model, **args)


if __name__ == "__main__":
    seed_everything(0)
    dispatch(TrainerConfig)
run_sjc.py
ADDED
@@ -0,0 +1,298 @@
import math
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from imageio import imwrite
from pydantic import validator

from my.utils import (
    tqdm, EventStorage, HeartBeat, EarlyLoopBreak,
    get_event_storage, get_heartbeat, read_stats
)
from my.config import BaseConf, dispatch, optional_load_config
from my.utils.seed import seed_everything

from adapt import ScoreAdapter, karras_t_schedule
from run_img_sampling import GDDPM, SD, StableDiffusion
from misc import torch_samps_to_imgs
from pose import PoseConfig

from run_nerf import VoxConfig
from voxnerf.utils import every
from voxnerf.render import (
    as_torch_tsrs, rays_from_img, ray_box_intersect, render_ray_bundle
)
from voxnerf.vis import stitch_vis, bad_vis as nerf_vis


device_glb = torch.device("cuda")


def tsr_stats(tsr):
    return {
        "mean": tsr.mean().item(),
        "std": tsr.std().item(),
        "max": tsr.max().item(),
    }


class SJC(BaseConf):
    family: str = "sd"
    gddpm: GDDPM = GDDPM()
    sd: SD = SD(
        variant="v1",
        prompt="A high quality photo of a delicious burger",
        scale=100.0
    )
    lr: float = 0.05
    n_steps: int = 10000
    vox: VoxConfig = VoxConfig(
        model_type="V_SD", grid_size=100, density_shift=-1.0, c=3,
        blend_bg_texture=True, bg_texture_hw=4,
        bbox_len=1.0
    )
    pose: PoseConfig = PoseConfig(rend_hw=64, FoV=60.0, R=1.5)

    emptiness_scale: int = 10
    emptiness_weight: int = 1e4
    emptiness_step: float = 0.5
    emptiness_multiplier: float = 20.0

    depth_weight: int = 0

    var_red: bool = True

    @validator("vox")
    def check_vox(cls, vox_cfg, values):
        family = values['family']
        if family == "sd":
            vox_cfg.c = 4
        return vox_cfg

    def run(self):
        cfgs = self.dict()

        family = cfgs.pop("family")
        model = getattr(self, family).make()

        cfgs.pop("vox")
        vox = self.vox.make()

        cfgs.pop("pose")
        poser = self.pose.make()

        sjc_3d(**cfgs, poser=poser, model=model, vox=vox)


def sjc_3d(
    poser, vox, model: ScoreAdapter,
    lr, n_steps, emptiness_scale, emptiness_weight, emptiness_step, emptiness_multiplier,
    depth_weight, var_red, **kwargs
):
    del kwargs

    assert model.samps_centered()
    _, target_H, target_W = model.data_shape()
    bs = 1
    aabb = vox.aabb.T.cpu().numpy()
    vox = vox.to(device_glb)
    opt = torch.optim.Adamax(vox.opt_params(), lr=lr)

    H, W = poser.H, poser.W
    Ks, poses, prompt_prefixes = poser.sample_train(n_steps)

    ts = model.us[30:-10]
    fuse = EarlyLoopBreak(5)

    same_noise = torch.randn(1, 4, H, W, device=model.device).repeat(bs, 1, 1, 1)

    with tqdm(total=n_steps) as pbar, \
        HeartBeat(pbar) as hbeat, \
            EventStorage() as metric:
        for i in range(n_steps):
            if fuse.on_break():
                break

            p = f"{prompt_prefixes[i]} {model.prompt}"
            score_conds = model.prompts_emb([p])

            y, depth, ws = render_one_view(vox, aabb, H, W, Ks[i], poses[i], return_w=True)

            if isinstance(model, StableDiffusion):
                pass
            else:
                y = torch.nn.functional.interpolate(y, (target_H, target_W), mode='bilinear')

            opt.zero_grad()

            with torch.no_grad():
                chosen_σs = np.random.choice(ts, bs, replace=False)
                chosen_σs = chosen_σs.reshape(-1, 1, 1, 1)
                chosen_σs = torch.as_tensor(chosen_σs, device=model.device, dtype=torch.float32)
                # chosen_σs = us[i]

                noise = torch.randn(bs, *y.shape[1:], device=model.device)

                zs = y + chosen_σs * noise
                Ds = model.denoise(zs, chosen_σs, **score_conds)

                if var_red:
                    grad = (Ds - y) / chosen_σs
                else:
                    grad = (Ds - zs) / chosen_σs

                grad = grad.mean(0, keepdim=True)

            y.backward(-grad, retain_graph=True)

            if depth_weight > 0:
                center_depth = depth[7:-7, 7:-7]
                border_depth_mean = (depth.sum() - center_depth.sum()) / (64*64-50*50)
                center_depth_mean = center_depth.mean()
                depth_diff = center_depth_mean - border_depth_mean
                depth_loss = - torch.log(depth_diff + 1e-12)
                depth_loss = depth_weight * depth_loss
                depth_loss.backward(retain_graph=True)

            emptiness_loss = torch.log(1 + emptiness_scale * ws).mean()
            emptiness_loss = emptiness_weight * emptiness_loss
            if emptiness_step * n_steps <= i:
                emptiness_loss *= emptiness_multiplier
            emptiness_loss.backward()

            opt.step()

            metric.put_scalars(**tsr_stats(y))

            if every(pbar, percent=1):
                with torch.no_grad():
                    if isinstance(model, StableDiffusion):
                        y = model.decode(y)
                    vis_routine(metric, y, depth)

            # if every(pbar, step=2500):
            #     metric.put_artifact(
            #         "ckpt", ".pt", lambda fn: torch.save(vox.state_dict(), fn)
            #     )
            #     with EventStorage("test"):
            #         evaluate(model, vox, poser)

            metric.step()
            pbar.update()
            pbar.set_description(p)
            hbeat.beat()

        metric.put_artifact(
            "ckpt", ".pt", lambda fn: torch.save(vox.state_dict(), fn)
        )
        with EventStorage("test"):
            evaluate(model, vox, poser)

        metric.step()

        hbeat.done()


@torch.no_grad()
def evaluate(score_model, vox, poser):
    H, W = poser.H, poser.W
    vox.eval()
    K, poses = poser.sample_test(100)

    fuse = EarlyLoopBreak(5)
    metric = get_event_storage()
    hbeat = get_heartbeat()

    aabb = vox.aabb.T.cpu().numpy()
    vox = vox.to(device_glb)

    num_imgs = len(poses)

    for i in (pbar := tqdm(range(num_imgs))):
        if fuse.on_break():
            break

        pose = poses[i]
        y, depth = render_one_view(vox, aabb, H, W, K, pose)
        if isinstance(score_model, StableDiffusion):
            y = score_model.decode(y)
        vis_routine(metric, y, depth)

        metric.step()
        hbeat.beat()

    metric.flush_history()

    metric.put_artifact(
        "view_seq", ".mp4",
        lambda fn: stitch_vis(fn, read_stats(metric.output_dir, "view")[1])
    )

    metric.step()


def render_one_view(vox, aabb, H, W, K, pose, return_w=False):
    N = H * W
    ro, rd = rays_from_img(H, W, K, pose)
    ro, rd, t_min, t_max = scene_box_filter(ro, rd, aabb)
    assert len(ro) == N, "for now all pixels must be in"
    ro, rd, t_min, t_max = as_torch_tsrs(vox.device, ro, rd, t_min, t_max)
    rgbs, depth, weights = render_ray_bundle(vox, ro, rd, t_min, t_max)

    rgbs = rearrange(rgbs, "(h w) c -> 1 c h w", h=H, w=W)
    depth = rearrange(depth, "(h w) 1 -> h w", h=H, w=W)
    if return_w:
        return rgbs, depth, weights
    else:
        return rgbs, depth


def scene_box_filter(ro, rd, aabb):
    _, t_min, t_max = ray_box_intersect(ro, rd, aabb)
    # do not render what's behind the ray origin
    t_min, t_max = np.maximum(t_min, 0), np.maximum(t_max, 0)
    return ro, rd, t_min, t_max


def vis_routine(metric, y, depth):
    pane = nerf_vis(y, depth, final_H=256)
    im = torch_samps_to_imgs(y)[0]
    depth = depth.cpu().numpy()
    metric.put_artifact("view", ".png", lambda fn: imwrite(fn, pane))
    metric.put_artifact("img", ".png", lambda fn: imwrite(fn, im))
    metric.put_artifact("depth", ".npy", lambda fn: np.save(fn, depth))


def evaluate_ckpt():
    cfg = optional_load_config(fname="full_config.yml")
    assert len(cfg) > 0, "can't find cfg file"
    mod = SJC(**cfg)

    family = cfg.pop("family")
    model: ScoreAdapter = getattr(mod, family).make()
    vox = mod.vox.make()
    poser = mod.pose.make()

    pbar = tqdm(range(1))

    with EventStorage(), HeartBeat(pbar):
        ckpt_fname = latest_ckpt()
        state = torch.load(ckpt_fname, map_location="cpu")
        vox.load_state_dict(state)
        vox.to(device_glb)

        with EventStorage("test"):
            evaluate(model, vox, poser)


def latest_ckpt():
    ts, ys = read_stats("./", "ckpt")
    assert len(ys) > 0
    return ys[-1]


if __name__ == "__main__":
    seed_everything(0)
    dispatch(SJC)
    # evaluate_ckpt()
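For readability, a toy restatement of the score step inside sjc_3d above; toy_denoise is a stand-in for model.denoise, and the tensors are placeholders rather than actual renderer output:

import torch

def toy_denoise(z, σ):                              # stand-in for model.denoise(zs, chosen_σs, **score_conds)
    return z / (1 + σ ** 2)                         # a toy denoiser, for illustration only

y = torch.zeros(1, 4, 64, 64, requires_grad=True)   # placeholder for the rendered latent image
σ = torch.tensor(5.0)                                # placeholder noise level drawn from ts

with torch.no_grad():
    z = y + σ * torch.randn_like(y)                 # perturb the rendering
    D = toy_denoise(z, σ)
    grad = (D - y) / σ                              # var_red=True form; (D - z) / σ otherwise

y.backward(-grad)                                   # -grad flows back through the renderer to the voxel params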
sd1/__init__.py
ADDED
File without changes
sd1/configs/v1-finetune_textual_inverison.yaml
ADDED
@@ -0,0 +1,106 @@
model:
  base_learning_rate: 5.0e-03
  target: ldm.models.diffusion.ddpm_textual_inversion.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: image
    cond_stage_key: caption
    image_size: 64
    channels: 4
    cond_stage_trainable: true   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    embedding_reg_weight: 0.0

    personalization_config:
      target: ldm.modules.embedding_manager.EmbeddingManager
      params:
        placeholder_strings: ["*"]
        initializer_words: ["sculpture"]
        per_image_tokens: false
        num_vectors_per_token: 1
        progressive_words: False

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

data:
  target: main.DataModuleFromConfig
  params:
    batch_size: 2
    num_workers: 2
    wrap: false
    train:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: train
        per_image_tokens: false
        repeats: 100
    validation:
      target: ldm.data.personalized.PersonalizedBase
      params:
        size: 512
        set: val
        per_image_tokens: false
        repeats: 10

lightning:
  callbacks:
    image_logger:
      target: main.ImageLogger
      params:
        batch_frequency: 500
        max_images: 8
        increase_log_steps: False

  trainer:
    benchmark: True
    max_steps: 15000
    gpus: 0,